Spaces:
Running
Running
modularize + nest scripts; reduce technical debt
Browse files- .clinerules/hocr-basics-api.md +106 -0
- .clinerules/project-brief.md +21 -0
- .gitignore +1 -0
- app.py +1 -1
- config.py +9 -14
- ocr_processing.py +90 -86
- requirements.txt +1 -1
- structured_ocr.py +0 -0
- ui/ui_components.py +590 -0
- utils/helpers/language_detection.py +373 -0
- utils/helpers/letterhead_handler.py +82 -0
- utils/helpers/ocr_text_repair.py +270 -0
- utils/pdf_ocr.py +457 -0
.clinerules/hocr-basics-api.md
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# HOCR Basics: API Integrations (Streamlit and Mistral OCR)
|
2 |
+
|
3 |
+
This rule defines the essential development standards for integrating the Mistral OCR API and using Streamlit components in the `milwright/historical-ocr` application.
|
4 |
+
|
5 |
+
## 📌 Rule 1: Mistral OCR API Usage
|
6 |
+
|
7 |
+
* **Endpoint:**
|
8 |
+
`POST https://api.mistral.ai/v1/ocr`
|
9 |
+
|
10 |
+
* **Headers:**
|
11 |
+
|
12 |
+
```http
|
13 |
+
Authorization: Bearer YOUR_API_KEY
|
14 |
+
Content-Type: application/json
|
15 |
+
```
|
16 |
+
|
17 |
+
* **Required JSON Body Fields:**
|
18 |
+
|
19 |
+
```json
|
20 |
+
{
|
21 |
+
"file_url": "https://example.com/your.pdf"
|
22 |
+
}
|
23 |
+
```
|
24 |
+
|
25 |
+
* **Expected Response Fields:**
|
26 |
+
|
27 |
+
* `text`: Raw OCR output
|
28 |
+
* `metadata`: Document structure, language, layout information
|
29 |
+
|
30 |
+
> **Note:** Always validate presence of required fields and handle error codes gracefully.
|
31 |
+
|
32 |
+
---
|
33 |
+
|
34 |
+
### 🖼️ Rule 2: Streamlit Usage Standards
|
35 |
+
|
36 |
+
* Use these core components:
|
37 |
+
|
38 |
+
* `st.file_uploader()`
|
39 |
+
* `st.selectbox()`
|
40 |
+
* `st.image()`
|
41 |
+
* `st.markdown()`
|
42 |
+
* `st.download_button()`
|
43 |
+
|
44 |
+
* Always set:
|
45 |
+
`use_container_width=True` for responsive display where supported
|
46 |
+
|
47 |
+
* Avoid global state; prefer `st.session_state` for interactivity and stateful inputs
|
48 |
+
|
49 |
+
## Mistral OCR Examples
|
50 |
+
|
51 |
+
``` json
|
52 |
+
{
|
53 |
+
"id": "string",
|
54 |
+
"object": "model",
|
55 |
+
"created": 0,{
|
56 |
+
"model": "string",
|
57 |
+
"id": "string",
|
58 |
+
"document": {
|
59 |
+
"document_url": "string",
|
60 |
+
"document_name": "string",
|
61 |
+
"type": "document_url"
|
62 |
+
},
|
63 |
+
"pages": [
|
64 |
+
0
|
65 |
+
],
|
66 |
+
"include_image_base64": true,
|
67 |
+
"image_limit": 0,
|
68 |
+
"image_min_size": 0
|
69 |
+
}
|
70 |
+
```
|
71 |
+
|
72 |
+
``` json
|
73 |
+
{
|
74 |
+
"pages": [
|
75 |
+
{
|
76 |
+
"index": 0,
|
77 |
+
"markdown": "string",
|
78 |
+
"images": [
|
79 |
+
{
|
80 |
+
"id": "string",
|
81 |
+
"top_left_x": 0,
|
82 |
+
"top_left_y": 0,
|
83 |
+
"bottom_right_x": 0,
|
84 |
+
"bottom_right_y": 0,
|
85 |
+
"image_base64": "string"
|
86 |
+
}
|
87 |
+
],
|
88 |
+
"dimensions": {
|
89 |
+
"dpi": 0,
|
90 |
+
"height": 0,
|
91 |
+
"width": 0
|
92 |
+
}
|
93 |
+
}
|
94 |
+
],
|
95 |
+
"model": "string",
|
96 |
+
"usage_info": {
|
97 |
+
"pages_processed": 0,
|
98 |
+
"doc_size_bytes": 0
|
99 |
+
}
|
100 |
+
}
|
101 |
+
```
|
102 |
+
|
103 |
+
### Links and Resources to Understand
|
104 |
+
|
105 |
+
* [URL to Mistral OCR APi doc](https://docs.mistral.ai/api/#tag/batch/operation/jobs_api_routes_batch_cancel_batch_job)
|
106 |
+
* [URL to Streamlit API documentation](https://docs.streamlit.io/develop/api-reference)
|
.clinerules/project-brief.md
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Project Brief
|
2 |
+
|
3 |
+
Historical OCR is an advanced optical character recognition (OCR) application designed to support historical research. It leverages Mistral AI's OCR models alongside image preprocessing pipelines optimized for archival material.
|
4 |
+
|
5 |
+
High-Level Overview
|
6 |
+
|
7 |
+
Building a Streamlit-based web application to process historical documents (images or PDFs), optimize them for OCR using advanced preprocessing techniques, and extract structured text and metadata through Mistral's large language models.
|
8 |
+
|
9 |
+
Core Requirements and Goals
|
10 |
+
|
11 |
+
Upload and preprocess historical documents
|
12 |
+
|
13 |
+
Automatically detect document types (e.g., handwritten letters, scientific papers)
|
14 |
+
|
15 |
+
Apply tailored OCR prompting and structured output based on document type
|
16 |
+
|
17 |
+
Support user-defined contextual instructions to refine output
|
18 |
+
|
19 |
+
Provide downloadable structured transcripts and analysis
|
20 |
+
|
21 |
+
Example: "Building a Streamlit web app for OCR transcription and structured extraction from historical documents using Mistral AI."
|
.gitignore
CHANGED
@@ -32,3 +32,4 @@ input/*.pdf
|
|
32 |
|
33 |
# Temporary documents
|
34 |
Tmplf6xnkgr*
|
|
|
|
32 |
|
33 |
# Temporary documents
|
34 |
Tmplf6xnkgr*
|
35 |
+
.env
|
app.py
CHANGED
@@ -20,7 +20,7 @@ import streamlit as st
|
|
20 |
# Local application/module imports
|
21 |
from preprocessing import convert_pdf_to_images, preprocess_image
|
22 |
from ocr_processing import process_file
|
23 |
-
from ui_components import (
|
24 |
ProgressReporter,
|
25 |
create_sidebar_options,
|
26 |
display_results,
|
|
|
20 |
# Local application/module imports
|
21 |
from preprocessing import convert_pdf_to_images, preprocess_image
|
22 |
from ocr_processing import process_file
|
23 |
+
from ui.ui_components import (
|
24 |
ProgressReporter,
|
25 |
create_sidebar_options,
|
26 |
display_results,
|
config.py
CHANGED
@@ -17,39 +17,34 @@ load_dotenv()
|
|
17 |
# Priority order:
|
18 |
# 1. HF_API_KEY environment variable (Hugging Face standard)
|
19 |
# 2. HUGGING_FACE_API_KEY environment variable (alternative name)
|
20 |
-
# 3.
|
21 |
-
# 4.
|
|
|
22 |
|
23 |
MISTRAL_API_KEY = os.environ.get("HF_API_KEY",
|
24 |
os.environ.get("HUGGING_FACE_API_KEY",
|
25 |
-
os.environ.get("
|
|
|
26 |
|
27 |
if not MISTRAL_API_KEY:
|
28 |
logger.warning("No Mistral API key found in environment variables. API functionality will be limited.")
|
29 |
|
30 |
# Check if we're in test mode (allows operation without valid API key)
|
31 |
-
# Set to False to use actual API calls
|
32 |
TEST_MODE = False
|
33 |
|
34 |
-
# Just check if API key exists
|
35 |
-
if not MISTRAL_API_KEY and not TEST_MODE:
|
36 |
-
logger.warning("No Mistral API key found. OCR functionality will not work unless TEST_MODE is enabled.")
|
37 |
-
|
38 |
-
if TEST_MODE:
|
39 |
-
logger.info("TEST_MODE is enabled. Using mock responses instead of actual API calls.")
|
40 |
-
|
41 |
# Model settings with fallbacks
|
42 |
OCR_MODEL = os.environ.get("MISTRAL_OCR_MODEL", "mistral-ocr-latest")
|
43 |
TEXT_MODEL = os.environ.get("MISTRAL_TEXT_MODEL", "mistral-small-latest") # Updated from ministral-8b-latest
|
44 |
-
VISION_MODEL = os.environ.get("MISTRAL_VISION_MODEL", "mistral-small-latest") #
|
45 |
|
46 |
# Image preprocessing settings optimized for historical documents
|
47 |
# These can be customized from environment variables
|
48 |
IMAGE_PREPROCESSING = {
|
49 |
-
"enhance_contrast": float(os.environ.get("ENHANCE_CONTRAST", "
|
50 |
"sharpen": os.environ.get("SHARPEN", "True").lower() in ("true", "1", "yes"),
|
51 |
"denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
|
52 |
-
"max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "
|
53 |
"target_dpi": int(os.environ.get("TARGET_DPI", "300")), # Target DPI for scaling
|
54 |
"compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "100")), # Higher quality for better OCR results
|
55 |
# # Enhanced settings for handwritten documents
|
|
|
17 |
# Priority order:
|
18 |
# 1. HF_API_KEY environment variable (Hugging Face standard)
|
19 |
# 2. HUGGING_FACE_API_KEY environment variable (alternative name)
|
20 |
+
# 3. HF_MISTRAL_API_KEY environment variable (for Hugging Face deployment)
|
21 |
+
# 4. MISTRAL_API_KEY environment variable (fallback)
|
22 |
+
# 5. Empty string (will show warning in app)
|
23 |
|
24 |
MISTRAL_API_KEY = os.environ.get("HF_API_KEY",
|
25 |
os.environ.get("HUGGING_FACE_API_KEY",
|
26 |
+
os.environ.get("HF_MISTRAL_API_KEY",
|
27 |
+
os.environ.get("MISTRAL_API_KEY", "")))).strip()
|
28 |
|
29 |
if not MISTRAL_API_KEY:
|
30 |
logger.warning("No Mistral API key found in environment variables. API functionality will be limited.")
|
31 |
|
32 |
# Check if we're in test mode (allows operation without valid API key)
|
33 |
+
# Set to False to use actual API calls with Mistral API
|
34 |
TEST_MODE = False
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
# Model settings with fallbacks
|
37 |
OCR_MODEL = os.environ.get("MISTRAL_OCR_MODEL", "mistral-ocr-latest")
|
38 |
TEXT_MODEL = os.environ.get("MISTRAL_TEXT_MODEL", "mistral-small-latest") # Updated from ministral-8b-latest
|
39 |
+
VISION_MODEL = os.environ.get("MISTRAL_VISION_MODEL", "mistral-small-latest") # faster model that supports vision
|
40 |
|
41 |
# Image preprocessing settings optimized for historical documents
|
42 |
# These can be customized from environment variables
|
43 |
IMAGE_PREPROCESSING = {
|
44 |
+
"enhance_contrast": float(os.environ.get("ENHANCE_CONTRAST", "3.5")), # Increased contrast for better text recognition
|
45 |
"sharpen": os.environ.get("SHARPEN", "True").lower() in ("true", "1", "yes"),
|
46 |
"denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
|
47 |
+
"max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "200.0")), # Increased size limit for better quality
|
48 |
"target_dpi": int(os.environ.get("TARGET_DPI", "300")), # Target DPI for scaling
|
49 |
"compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "100")), # Higher quality for better OCR results
|
50 |
# # Enhanced settings for handwritten documents
|
ocr_processing.py
CHANGED
@@ -82,7 +82,7 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
82 |
|
83 |
# Create a container for progress indicators if not provided
|
84 |
if progress_reporter is None:
|
85 |
-
from ui_components import ProgressReporter
|
86 |
progress_reporter = ProgressReporter(st.empty()).setup()
|
87 |
|
88 |
# Initialize temporary file paths list
|
@@ -119,10 +119,7 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
119 |
|
120 |
# For PDFs, we need to handle differently
|
121 |
if file_type == "pdf":
|
122 |
-
progress_reporter.update(20, "
|
123 |
-
|
124 |
-
# Process PDF with direct handling
|
125 |
-
progress_reporter.update(30, "Processing PDF with OCR...")
|
126 |
|
127 |
# Create a temporary file for processing
|
128 |
temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=file_ext).name
|
@@ -145,91 +142,98 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
145 |
custom_prompt
|
146 |
)
|
147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
# Process with cached function if possible
|
149 |
try:
|
150 |
-
|
151 |
-
|
152 |
-
modified_custom_prompt = custom_prompt
|
153 |
-
|
154 |
-
# Add PDF-specific instructions
|
155 |
-
if not modified_custom_prompt:
|
156 |
-
modified_custom_prompt = "This is a multi-page PDF document."
|
157 |
-
elif "pdf" not in modified_custom_prompt.lower() and "multi-page" not in modified_custom_prompt.lower():
|
158 |
-
modified_custom_prompt += " This is a multi-page PDF document."
|
159 |
-
|
160 |
-
# Update the cache key with the modified prompt
|
161 |
-
if modified_custom_prompt != custom_prompt:
|
162 |
-
cache_key = generate_cache_key(
|
163 |
-
open(temp_path, 'rb').read(),
|
164 |
-
file_type,
|
165 |
-
use_vision,
|
166 |
-
preprocessing_options,
|
167 |
-
pdf_rotation,
|
168 |
-
modified_custom_prompt
|
169 |
-
)
|
170 |
-
|
171 |
-
result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options), modified_custom_prompt)
|
172 |
progress_reporter.update(90, "Finalizing results...")
|
173 |
except Exception as e:
|
174 |
-
logger.warning(f"Cached processing failed: {str(e)}.
|
175 |
-
progress_reporter.update(60, f"Processing error: {str(e)}.
|
176 |
-
|
177 |
-
# If caching fails, process directly
|
178 |
-
processor = StructuredOCR()
|
179 |
-
|
180 |
|
181 |
-
#
|
182 |
-
doc_type = preprocessing_options.get("document_type", "standard")
|
183 |
-
modified_custom_prompt = custom_prompt
|
184 |
-
|
185 |
-
# Check for letterhead/marginalia document types with specialized handling
|
186 |
try:
|
187 |
-
from
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
doc_type = "letterhead"
|
203 |
except ImportError:
|
204 |
-
logger.
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
# Add PDF-specific instructions if needed
|
219 |
-
if "pdf" not in modified_custom_prompt.lower() and "multi-page" not in modified_custom_prompt.lower():
|
220 |
-
modified_custom_prompt += " This is a multi-page PDF document."
|
221 |
-
|
222 |
-
# Process directly with optimized settings
|
223 |
-
result = processor.process_file(
|
224 |
-
file_path=temp_path,
|
225 |
-
file_type="pdf",
|
226 |
-
use_vision=use_vision,
|
227 |
-
custom_prompt=modified_custom_prompt,
|
228 |
-
file_size_mb=file_size_mb,
|
229 |
-
pdf_rotation=pdf_rotation
|
230 |
-
)
|
231 |
-
|
232 |
-
progress_reporter.update(90, "Finalizing results...")
|
233 |
else:
|
234 |
# For image files
|
235 |
progress_reporter.update(20, "Preparing image for processing...")
|
@@ -390,7 +394,7 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
390 |
|
391 |
# Check for letterhead/marginalia document types with specialized handling
|
392 |
try:
|
393 |
-
from letterhead_handler import get_letterhead_prompt, is_likely_letterhead
|
394 |
# Extract text density features if available
|
395 |
features = None
|
396 |
if 'text_density' in preprocessing_options:
|
@@ -453,7 +457,7 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
453 |
|
454 |
# Check for letterhead/marginalia document types with specialized handling
|
455 |
try:
|
456 |
-
from letterhead_handler import get_letterhead_prompt, is_likely_letterhead
|
457 |
# Extract text density features if available
|
458 |
features = None
|
459 |
if 'text_density' in preprocessing_options:
|
@@ -503,7 +507,7 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
503 |
|
504 |
# Check for duplicated text patterns that indicate handwritten text issues
|
505 |
try:
|
506 |
-
from ocr_text_repair import detect_duplicate_text_issues, get_enhanced_preprocessing_options, get_handwritten_specific_prompt, clean_duplicated_text
|
507 |
|
508 |
# Check OCR output for duplication issues
|
509 |
if result and 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
|
|
|
82 |
|
83 |
# Create a container for progress indicators if not provided
|
84 |
if progress_reporter is None:
|
85 |
+
from ui.ui_components import ProgressReporter
|
86 |
progress_reporter = ProgressReporter(st.empty()).setup()
|
87 |
|
88 |
# Initialize temporary file paths list
|
|
|
119 |
|
120 |
# For PDFs, we need to handle differently
|
121 |
if file_type == "pdf":
|
122 |
+
progress_reporter.update(20, "Preparing PDF document...")
|
|
|
|
|
|
|
123 |
|
124 |
# Create a temporary file for processing
|
125 |
temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=file_ext).name
|
|
|
142 |
custom_prompt
|
143 |
)
|
144 |
|
145 |
+
# Use the document type information from preprocessing options
|
146 |
+
doc_type = preprocessing_options.get("document_type", "standard")
|
147 |
+
modified_custom_prompt = custom_prompt
|
148 |
+
|
149 |
+
# Enhance the prompt with document-type specific instructions
|
150 |
+
# Check for letterhead/marginalia document types with specialized handling
|
151 |
+
try:
|
152 |
+
from utils.helpers.letterhead_handler import get_letterhead_prompt, is_likely_letterhead
|
153 |
+
# Extract text density features if available
|
154 |
+
features = None
|
155 |
+
if 'text_density' in preprocessing_options:
|
156 |
+
features = preprocessing_options['text_density']
|
157 |
+
|
158 |
+
# Check if this looks like a letterhead document
|
159 |
+
if is_likely_letterhead(temp_path, features):
|
160 |
+
# Get specialized letterhead prompt
|
161 |
+
letterhead_prompt = get_letterhead_prompt(temp_path, features)
|
162 |
+
if letterhead_prompt:
|
163 |
+
logger.info(f"Using specialized letterhead prompt for document")
|
164 |
+
modified_custom_prompt = letterhead_prompt
|
165 |
+
# Set document type for tracking
|
166 |
+
preprocessing_options["document_type"] = "letterhead"
|
167 |
+
doc_type = "letterhead"
|
168 |
+
except ImportError:
|
169 |
+
logger.debug("Letterhead handler not available")
|
170 |
+
|
171 |
+
# Add document-type specific instructions based on preprocessing options
|
172 |
+
if doc_type == "handwritten" and not modified_custom_prompt:
|
173 |
+
modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
174 |
+
elif doc_type == "handwritten" and "handwritten" not in modified_custom_prompt.lower():
|
175 |
+
modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
176 |
+
elif doc_type == "newspaper" and not modified_custom_prompt:
|
177 |
+
modified_custom_prompt = "This is a newspaper or document with columns. Please extract all text content from each column, maintaining proper reading order."
|
178 |
+
elif doc_type == "newspaper" and "column" not in modified_custom_prompt.lower() and "newspaper" not in modified_custom_prompt.lower():
|
179 |
+
modified_custom_prompt += " This appears to be a newspaper or document with columns. Please extract all text content from each column."
|
180 |
+
elif doc_type == "book" and not modified_custom_prompt:
|
181 |
+
modified_custom_prompt = "This is a book page. Extract titles, headers, footnotes, and body text, preserving paragraph structure and formatting."
|
182 |
+
|
183 |
+
# Update the cache key with the modified prompt
|
184 |
+
if modified_custom_prompt != custom_prompt:
|
185 |
+
cache_key = generate_cache_key(
|
186 |
+
open(temp_path, 'rb').read(),
|
187 |
+
file_type,
|
188 |
+
use_vision,
|
189 |
+
preprocessing_options,
|
190 |
+
pdf_rotation,
|
191 |
+
modified_custom_prompt
|
192 |
+
)
|
193 |
+
|
194 |
+
progress_reporter.update(30, "Processing PDF with enhanced OCR...")
|
195 |
+
|
196 |
# Process with cached function if possible
|
197 |
try:
|
198 |
+
result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key,
|
199 |
+
str(preprocessing_options), modified_custom_prompt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
progress_reporter.update(90, "Finalizing results...")
|
201 |
except Exception as e:
|
202 |
+
logger.warning(f"Cached processing failed: {str(e)}. Using direct processing.")
|
203 |
+
progress_reporter.update(60, f"Processing error: {str(e)}. Using enhanced PDF processor...")
|
|
|
|
|
|
|
|
|
204 |
|
205 |
+
# Import the enhanced PDF processor
|
|
|
|
|
|
|
|
|
206 |
try:
|
207 |
+
from utils.pdf_ocr import PDFOCR
|
208 |
+
|
209 |
+
# Use our specialized PDF processor
|
210 |
+
pdf_processor = PDFOCR()
|
211 |
+
|
212 |
+
# Process with the enhanced PDF processor
|
213 |
+
result = pdf_processor.process_pdf(
|
214 |
+
pdf_path=temp_path,
|
215 |
+
use_vision=use_vision,
|
216 |
+
max_pages=max_pages,
|
217 |
+
custom_prompt=modified_custom_prompt
|
218 |
+
)
|
219 |
+
|
220 |
+
logger.info("PDF successfully processed with enhanced PDF processor")
|
221 |
+
progress_reporter.update(90, "Finalizing results...")
|
|
|
222 |
except ImportError:
|
223 |
+
logger.warning("Enhanced PDF processor not available. Falling back to standard processing.")
|
224 |
+
progress_reporter.update(70, "Falling back to standard PDF processing...")
|
225 |
+
|
226 |
+
# If enhanced processor is not available, fall back to direct StructuredOCR processing
|
227 |
+
processor = StructuredOCR()
|
228 |
+
result = processor.process_file(
|
229 |
+
file_path=temp_path,
|
230 |
+
file_type="pdf",
|
231 |
+
use_vision=use_vision,
|
232 |
+
custom_prompt=modified_custom_prompt,
|
233 |
+
file_size_mb=file_size_mb,
|
234 |
+
max_pages=max_pages
|
235 |
+
)
|
236 |
+
progress_reporter.update(90, "Finalizing results...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
237 |
else:
|
238 |
# For image files
|
239 |
progress_reporter.update(20, "Preparing image for processing...")
|
|
|
394 |
|
395 |
# Check for letterhead/marginalia document types with specialized handling
|
396 |
try:
|
397 |
+
from utils.helpers.letterhead_handler import get_letterhead_prompt, is_likely_letterhead
|
398 |
# Extract text density features if available
|
399 |
features = None
|
400 |
if 'text_density' in preprocessing_options:
|
|
|
457 |
|
458 |
# Check for letterhead/marginalia document types with specialized handling
|
459 |
try:
|
460 |
+
from utils.helpers.letterhead_handler import get_letterhead_prompt, is_likely_letterhead
|
461 |
# Extract text density features if available
|
462 |
features = None
|
463 |
if 'text_density' in preprocessing_options:
|
|
|
507 |
|
508 |
# Check for duplicated text patterns that indicate handwritten text issues
|
509 |
try:
|
510 |
+
from utils.helpers.ocr_text_repair import detect_duplicate_text_issues, get_enhanced_preprocessing_options, get_handwritten_specific_prompt, clean_duplicated_text
|
511 |
|
512 |
# Check OCR output for duplication issues
|
513 |
if result and 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
|
requirements.txt
CHANGED
@@ -9,7 +9,7 @@ pydantic>=2.5.0 # Updated for better BaseModel support
|
|
9 |
Pillow>=10.0.0
|
10 |
opencv-python-headless>=4.8.0.74
|
11 |
pdf2image>=1.16.0
|
12 |
-
|
13 |
matplotlib>=3.7.0 # For visualization in preprocessing tests
|
14 |
|
15 |
# Data handling and utilities
|
|
|
9 |
Pillow>=10.0.0
|
10 |
opencv-python-headless>=4.8.0.74
|
11 |
pdf2image>=1.16.0
|
12 |
+
pytesseract>=0.3.10 # For local OCR fallback
|
13 |
matplotlib>=3.7.0 # For visualization in preprocessing tests
|
14 |
|
15 |
# Data handling and utilities
|
structured_ocr.py
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
ui/ui_components.py
ADDED
@@ -0,0 +1,590 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import os
|
3 |
+
import io
|
4 |
+
import base64
|
5 |
+
import logging
|
6 |
+
import re
|
7 |
+
from datetime import datetime
|
8 |
+
from pathlib import Path
|
9 |
+
import json
|
10 |
+
|
11 |
+
# Define exports
|
12 |
+
__all__ = [
|
13 |
+
'ProgressReporter',
|
14 |
+
'create_sidebar_options',
|
15 |
+
'create_file_uploader',
|
16 |
+
'display_document_with_images',
|
17 |
+
'display_previous_results',
|
18 |
+
'display_about_tab',
|
19 |
+
'display_results' # Re-export from utils.ui_utils
|
20 |
+
]
|
21 |
+
from constants import (
|
22 |
+
DOCUMENT_TYPES,
|
23 |
+
DOCUMENT_LAYOUTS,
|
24 |
+
CUSTOM_PROMPT_TEMPLATES,
|
25 |
+
LAYOUT_PROMPT_ADDITIONS,
|
26 |
+
DEFAULT_PDF_DPI,
|
27 |
+
MIN_PDF_DPI,
|
28 |
+
MAX_PDF_DPI,
|
29 |
+
DEFAULT_MAX_PAGES,
|
30 |
+
PERFORMANCE_MODES,
|
31 |
+
PREPROCESSING_DOC_TYPES,
|
32 |
+
ROTATION_OPTIONS
|
33 |
+
)
|
34 |
+
from utils.text_utils import format_ocr_text, clean_raw_text, format_markdown_text # Import from text_utils
|
35 |
+
from utils.content_utils import (
|
36 |
+
classify_document_content,
|
37 |
+
extract_document_text,
|
38 |
+
extract_image_description
|
39 |
+
)
|
40 |
+
from utils.ui_utils import display_results
|
41 |
+
from preprocessing import preprocess_image
|
42 |
+
|
43 |
+
class ProgressReporter:
|
44 |
+
"""Class to handle progress reporting in the UI"""
|
45 |
+
|
46 |
+
def __init__(self, placeholder):
|
47 |
+
self.placeholder = placeholder
|
48 |
+
self.progress_bar = None
|
49 |
+
self.status_text = None
|
50 |
+
|
51 |
+
def setup(self):
|
52 |
+
"""Setup the progress components"""
|
53 |
+
with self.placeholder.container():
|
54 |
+
self.progress_bar = st.progress(0)
|
55 |
+
self.status_text = st.empty()
|
56 |
+
return self
|
57 |
+
|
58 |
+
def update(self, percent, status_text):
|
59 |
+
"""Update the progress bar and status text"""
|
60 |
+
if self.progress_bar is not None:
|
61 |
+
self.progress_bar.progress(percent / 100)
|
62 |
+
if self.status_text is not None:
|
63 |
+
self.status_text.text(status_text)
|
64 |
+
|
65 |
+
def complete(self, success=True):
|
66 |
+
"""Complete the progress reporting"""
|
67 |
+
if success:
|
68 |
+
if self.progress_bar is not None:
|
69 |
+
self.progress_bar.progress(100)
|
70 |
+
if self.status_text is not None:
|
71 |
+
self.status_text.text("Processing complete!")
|
72 |
+
else:
|
73 |
+
if self.status_text is not None:
|
74 |
+
self.status_text.text("Processing failed.")
|
75 |
+
|
76 |
+
# Clear the progress components after a delay
|
77 |
+
import time
|
78 |
+
time.sleep(0.8) # Short delay to show completion
|
79 |
+
if self.progress_bar is not None:
|
80 |
+
self.progress_bar.empty()
|
81 |
+
if self.status_text is not None:
|
82 |
+
self.status_text.empty()
|
83 |
+
|
84 |
+
def create_sidebar_options():
|
85 |
+
"""Create and return sidebar options"""
|
86 |
+
with st.sidebar:
|
87 |
+
st.markdown("## OCR Settings")
|
88 |
+
|
89 |
+
# Create a container for the sidebar options
|
90 |
+
with st.container():
|
91 |
+
# Default to using vision model (removed selection from UI)
|
92 |
+
use_vision = True
|
93 |
+
|
94 |
+
# Document type selection
|
95 |
+
doc_type = st.selectbox("Document Type", DOCUMENT_TYPES,
|
96 |
+
help="Select the type of document you're processing for better results")
|
97 |
+
|
98 |
+
# Document layout
|
99 |
+
doc_layout = st.selectbox("Document Layout", DOCUMENT_LAYOUTS,
|
100 |
+
help="Select the layout of your document")
|
101 |
+
|
102 |
+
# Initialize preprocessing variables with default values
|
103 |
+
grayscale = False
|
104 |
+
denoise = False
|
105 |
+
contrast = 0
|
106 |
+
rotation = 0
|
107 |
+
use_segmentation = False
|
108 |
+
|
109 |
+
# Custom prompt
|
110 |
+
custom_prompt = ""
|
111 |
+
# Get the template for the selected document type if not auto-detect
|
112 |
+
if doc_type != DOCUMENT_TYPES[0]:
|
113 |
+
prompt_template = CUSTOM_PROMPT_TEMPLATES.get(doc_type, "")
|
114 |
+
|
115 |
+
# Add layout information if not standard
|
116 |
+
if doc_layout != DOCUMENT_LAYOUTS[0]: # Not standard layout
|
117 |
+
layout_addition = LAYOUT_PROMPT_ADDITIONS.get(doc_layout, "")
|
118 |
+
if layout_addition:
|
119 |
+
prompt_template += " " + layout_addition
|
120 |
+
|
121 |
+
# Set the custom prompt
|
122 |
+
custom_prompt = prompt_template
|
123 |
+
|
124 |
+
# Allow user to edit the prompt (always visible)
|
125 |
+
custom_prompt = st.text_area("Custom Processing Instructions", value=custom_prompt,
|
126 |
+
help="Customize the instructions for processing this document",
|
127 |
+
height=80)
|
128 |
+
|
129 |
+
# Image preprocessing options (always visible)
|
130 |
+
st.markdown("### Image Preprocessing")
|
131 |
+
|
132 |
+
# Grayscale conversion
|
133 |
+
grayscale = st.checkbox("Convert to Grayscale",
|
134 |
+
value=True,
|
135 |
+
help="Convert color images to grayscale for better text recognition")
|
136 |
+
|
137 |
+
# Light denoising option
|
138 |
+
denoise = st.checkbox("Light Denoising",
|
139 |
+
value=True,
|
140 |
+
help="Apply gentle denoising to improve text clarity")
|
141 |
+
|
142 |
+
# Contrast adjustment
|
143 |
+
contrast = st.slider("Contrast Adjustment",
|
144 |
+
min_value=-20,
|
145 |
+
max_value=20,
|
146 |
+
value=5,
|
147 |
+
step=5,
|
148 |
+
help="Adjust image contrast (limited range)")
|
149 |
+
|
150 |
+
|
151 |
+
# Initialize rotation (keeping it set to 0)
|
152 |
+
rotation = 0
|
153 |
+
use_segmentation = False
|
154 |
+
|
155 |
+
# Create preprocessing options dictionary
|
156 |
+
# Map UI document types to preprocessing document types
|
157 |
+
doc_type_for_preprocessing = "standard"
|
158 |
+
if "Handwritten" in doc_type:
|
159 |
+
doc_type_for_preprocessing = "handwritten"
|
160 |
+
elif "Newspaper" in doc_type or "Magazine" in doc_type:
|
161 |
+
doc_type_for_preprocessing = "newspaper"
|
162 |
+
elif "Book" in doc_type or "Publication" in doc_type:
|
163 |
+
doc_type_for_preprocessing = "book" # Match the actual preprocessing type
|
164 |
+
|
165 |
+
preprocessing_options = {
|
166 |
+
"document_type": doc_type_for_preprocessing,
|
167 |
+
"grayscale": grayscale,
|
168 |
+
"denoise": denoise,
|
169 |
+
"contrast": contrast,
|
170 |
+
"rotation": rotation
|
171 |
+
}
|
172 |
+
|
173 |
+
# PDF-specific options
|
174 |
+
st.markdown("### PDF Options")
|
175 |
+
max_pages = st.number_input("Maximum Pages to Process",
|
176 |
+
min_value=1,
|
177 |
+
max_value=20,
|
178 |
+
value=DEFAULT_MAX_PAGES,
|
179 |
+
help="Limit the number of pages to process (for multi-page PDFs)")
|
180 |
+
|
181 |
+
# Set default values for removed options
|
182 |
+
pdf_dpi = DEFAULT_PDF_DPI
|
183 |
+
pdf_rotation = 0
|
184 |
+
|
185 |
+
# Create options dictionary
|
186 |
+
options = {
|
187 |
+
"use_vision": use_vision,
|
188 |
+
"perf_mode": "Quality", # Default to Quality, removed performance mode option
|
189 |
+
"pdf_dpi": pdf_dpi,
|
190 |
+
"max_pages": max_pages,
|
191 |
+
"pdf_rotation": pdf_rotation,
|
192 |
+
"custom_prompt": custom_prompt,
|
193 |
+
"preprocessing_options": preprocessing_options,
|
194 |
+
"use_segmentation": use_segmentation if 'use_segmentation' in locals() else False
|
195 |
+
}
|
196 |
+
|
197 |
+
return options
|
198 |
+
|
199 |
+
def create_file_uploader():
    """Render the app header, intro copy, and the document file uploader.

    Returns:
        The Streamlit ``UploadedFile`` object, or ``None`` until the user
        selects a file.
    """
    # Add app description (plain string: the HTML is static, no f-string needed)
    st.markdown('<div style="display: flex; align-items: center; gap: 10px;"><div style="font-size: 32px;">📜</div><div><h2 style="margin: 0; padding: 10px 0 0 0;">Historical OCR</h2></div></div>', unsafe_allow_html=True)
    st.markdown("<p style='font-size: 0.8em; color: #666; text-align: left;'>Made possible by Mistral AI</p>", unsafe_allow_html=True)

    # Add project framing
    st.markdown("""
    This tool assists scholars in historical research by extracting text from challenging documents. While it may not achieve 100% accuracy, it helps navigate:
    - **Historical newspapers** with complex layouts
    - **Handwritten documents** from various periods
    - **Photos of archival materials**

    Upload a document to begin, or explore the examples.
    """)

    # Create file uploader with a more concise label.
    # "jpeg" added so files carrying the equally common .jpeg extension are
    # accepted as well (the result cards elsewhere already treat .jpeg as an image).
    uploaded_file = st.file_uploader(
        "Select file",
        type=["pdf", "png", "jpg", "jpeg"],
        help="Upload a PDF or image file for OCR processing"
    )
    return uploaded_file
|
222 |
+
|
223 |
+
def display_document_with_images(result):
    """Render each page of an OCR result as an image with an optional caption.

    Page data is taken from ``result['pages_data']`` when present; otherwise it
    is reconstructed from ``result['raw_response_data']['pages']``. If neither
    source yields pages, an informational message is shown and the function
    returns early.

    Args:
        result: OCR result dict produced by the processing pipeline.
    """
    # Check for pages_data first
    if 'pages_data' in result and result['pages_data']:
        pages_data = result['pages_data']
    # If pages_data not available, try to extract from raw_response_data
    elif 'raw_response_data' in result and isinstance(result['raw_response_data'], dict) and 'pages' in result['raw_response_data']:
        # Build pages_data from raw_response_data
        pages_data = []
        raw_pages = result['raw_response_data']['pages']

        for page_idx, page in enumerate(raw_pages):
            if not isinstance(page, dict):
                continue

            page_data = {
                'page_number': page_idx + 1,
                'markdown': page.get('markdown', ''),
                'images': []
            }

            # Extract images if present (the API may use either key name)
            if 'images' in page and isinstance(page['images'], list):
                for img_idx, img in enumerate(page['images']):
                    if isinstance(img, dict) and ('base64' in img or 'image_base64' in img):
                        img_base64 = img.get('image_base64', img.get('base64', ''))
                        if img_base64:
                            page_data['images'].append({
                                'id': img.get('id', f"img_{page_idx}_{img_idx}"),
                                'image_base64': img_base64
                            })

            if page_data['markdown'] or page_data['images']:
                pages_data.append(page_data)
    else:
        st.info("No image data available.")
        return

    # Display each page
    for i, page_data in enumerate(pages_data):
        st.markdown(f"### Page {i+1}")

        # Only the page image is shown (the text column was removed);
        # several possible field names are tried in order.
        image_displayed = False

        # Try 'image_data' field first (raw base64-encoded bytes)
        if 'image_data' in page_data:
            try:
                # Convert base64 to image
                image_data = base64.b64decode(page_data['image_data'])
                st.image(io.BytesIO(image_data), use_container_width=True)
                image_displayed = True
            except Exception as e:
                st.error(f"Error displaying image from image_data: {str(e)}")

        # Try 'images' array if image_data didn't work
        if not image_displayed and 'images' in page_data and len(page_data['images']) > 0:
            for img in page_data['images']:
                if 'image_base64' in img:
                    try:
                        st.image(img['image_base64'], use_container_width=True)
                        image_displayed = True
                        break
                    except Exception as e:
                        st.error(f"Error displaying image from images array: {str(e)}")

        # Fall back to images embedded in the raw OCR response
        if not image_displayed and 'raw_response_data' in result:
            raw_data = result['raw_response_data']
            if isinstance(raw_data, dict) and 'pages' in raw_data:
                for raw_page in raw_data['pages']:
                    if isinstance(raw_page, dict) and 'images' in raw_page:
                        for img in raw_page['images']:
                            if isinstance(img, dict) and 'base64' in img:
                                st.image(img['base64'], use_container_width=True)
                                st.caption("Image from OCR response")
                                image_displayed = True
                                break
                        if image_displayed:
                            break

        if not image_displayed:
            st.info("No image available for this page.")

        # Extract and display alt text if the page text is a markdown image tag
        page_text = ""
        if 'text' in page_data:
            page_text = page_data['text']
        elif 'markdown' in page_data:
            page_text = page_data['markdown']

        if page_text and page_text.startswith("![") and page_text.endswith(")"):
            try:
                alt_text = page_text[2:page_text.index(']')]
                if alt_text and len(alt_text) > 5:  # Only show if alt text is meaningful
                    st.caption(f"Image description: {alt_text}")
            except ValueError:
                # str.index raises ValueError when ']' is absent — malformed
                # image tag, nothing to caption. (Was a bare `except:`.)
                pass
|
322 |
+
|
323 |
+
def display_previous_results():
    """Render the "Previous Results" tab.

    Shows a zip-download link for all stored results, a two-column grid of
    result cards, and — after a card's "View" button is clicked — a detail
    view of the selected result with Content / Raw JSON / Images tabs.

    Reads and writes ``st.session_state.previous_results`` and
    ``st.session_state.selected_previous_result``.
    """

    # Use a simple header without the button column
    st.header("Previous Results")

    # Display previous results if available
    if not st.session_state.previous_results:
        # Fixed: the <h3> tag was missing its `style` attribute name,
        # producing invalid HTML (`<h3="margin-bottom: ...">`).
        st.markdown("""
        <div style="text-align: center; padding: 30px 20px; background-color: #f8f9fa; border-radius: 6px; margin-top: 10px;">
            <div style="font-size: 36px; margin-bottom: 15px;">📄</div>
            <h3 style="margin-bottom: 16px; font-weight: 500;">No Previous Results</h3>
            <p style="font-size: 14px; color: #666;">Process a document to see your results history.</p>
        </div>
        """, unsafe_allow_html=True)
    else:
        # Prepare zip download outside of the UI flow
        try:
            # Create download button for all results
            from utils.image_utils import create_results_zip_in_memory
            zip_data = create_results_zip_in_memory(st.session_state.previous_results)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            # Simplified filename
            zip_filename = f"ocr_results_{timestamp}.zip"

            # Encode the zip data for direct download link
            zip_b64 = base64.b64encode(zip_data).decode()

            # Add styled download tag in the metadata section
            download_html = '<div style="display: flex; align-items: center; margin: 0.5rem 0; flex-wrap: wrap;">'
            download_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Download:</div>'
            download_html += f'<a href="data:application/zip;base64,{zip_b64}" download="{zip_filename}" class="subject-tag tag-download">All Results</a>'
            download_html += '</div>'
            st.markdown(download_html, unsafe_allow_html=True)
        except Exception:
            # Silent fail - no error message to keep UI clean
            pass

        # Create a cleaner, more minimal grid for results using Streamlit columns
        num_columns = 2  # Two columns for most screens

        # Create rows of result cards
        for i in range(0, len(st.session_state.previous_results), num_columns):
            # Create a row of columns
            cols = st.columns(num_columns)

            # Fill each column with a result card
            for j in range(num_columns):
                index = i + j
                if index < len(st.session_state.previous_results):
                    result = st.session_state.previous_results[index]

                    # Get basic info for the card
                    file_name = result.get("file_name", f"Document {index+1}")
                    timestamp = result.get("timestamp", "")

                    # Determine file type icon
                    if file_name.lower().endswith(".pdf"):
                        icon = "📄"
                    elif any(file_name.lower().endswith(ext) for ext in [".jpg", ".jpeg", ".png", ".gif"]):
                        icon = "🖼️"
                    else:
                        icon = "📝"

                    # Display a simplified card in each column
                    with cols[j]:
                        with st.container():
                            # Create visually cleaner card with less vertical space
                            st.markdown(f"""
                            <div style="padding: 10px; border: 1px solid #e0e0e0; border-radius: 6px; margin-bottom: 10px;">
                                <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 5px;">
                                    <div style="font-weight: 500; font-size: 14px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap;">{icon} {file_name}</div>
                                    <div style="color: #666; font-size: 12px;">{timestamp.split()[0] if timestamp else ""}</div>
                                </div>
                            </div>
                            """, unsafe_allow_html=True)

                            # Add a simple button below each card
                            if st.button("View", key=f"view_{index}", help=f"View {file_name}"):
                                st.session_state.selected_previous_result = st.session_state.previous_results[index]
                                st.rerun()

    # Display the selected result if available
    if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
        selected_result = st.session_state.selected_previous_result

        # Draw a separator between results list and selected document
        st.markdown("<hr style='margin: 20px 0 15px 0; border: none; height: 1px; background-color: #eee;'>", unsafe_allow_html=True)

        # Create a cleaner header for the selected document
        file_name = selected_result.get('file_name', 'Document')
        st.subheader(f"{file_name}")

        # Add a simple back button at the top
        if st.button("← Back to Results", key="back_to_results"):
            if 'selected_previous_result' in st.session_state:
                del st.session_state.selected_previous_result
            st.session_state.perform_reset = True
            st.rerun()

        # Simplified metadata display - just one line with essential info
        meta_html = '<div style="display: flex; flex-wrap: wrap; gap: 12px; margin: 8px 0 15px 0; font-size: 14px; color: #666;">'

        # Add timestamp
        if 'timestamp' in selected_result:
            meta_html += f'<div>{selected_result["timestamp"]}</div>'

        # Add languages if available (simplified)
        if 'languages' in selected_result and selected_result['languages']:
            languages = [lang for lang in selected_result['languages'] if lang is not None]
            if languages:
                meta_html += f'<div>Language: {", ".join(languages)}</div>'

        # Add page count if available (simplified)
        if 'limited_pages' in selected_result:
            meta_html += f'<div>Pages: {selected_result["limited_pages"]["processed"]}/{selected_result["limited_pages"]["total"]}</div>'

        meta_html += '</div>'
        st.markdown(meta_html, unsafe_allow_html=True)

        # Simplified tabs - using the same format as main view
        has_images = selected_result.get('has_images', False)
        if has_images:
            view_tabs = st.tabs(["Document Content", "Raw JSON", "Images"])
            view_tab1, view_tab2, view_tab3 = view_tabs
        else:
            view_tabs = st.tabs(["Document Content", "Raw JSON"])
            view_tab1, view_tab2 = view_tabs
            view_tab3 = None

        # First tab - Document Content (simplified structured view)
        with view_tab1:
            # Display content in a cleaner, more streamlined format
            if 'ocr_contents' in selected_result and isinstance(selected_result['ocr_contents'], dict):
                # Create a more focused list of important sections
                priority_sections = ["title", "content", "transcript", "summary"]
                displayed_sections = set()

                # First display priority sections
                for section in priority_sections:
                    if section in selected_result['ocr_contents'] and selected_result['ocr_contents'][section]:
                        content = selected_result['ocr_contents'][section]
                        if isinstance(content, str) and content.strip():
                            # Only add a subheader for meaningful section names, not raw_text
                            if section != "raw_text":
                                st.markdown(f"##### {section.replace('_', ' ').title()}")

                            # Format and display content
                            formatted_content = format_ocr_text(content, for_display=True)
                            st.markdown(formatted_content)
                            displayed_sections.add(section)

                # Then display any remaining sections not already shown
                for section, content in selected_result['ocr_contents'].items():
                    if (section not in displayed_sections and
                        section not in ['error', 'partial_text'] and
                        content):
                        st.markdown(f"##### {section.replace('_', ' ').title()}")

                        if isinstance(content, str):
                            st.markdown(format_ocr_text(content, for_display=True))
                        elif isinstance(content, list):
                            for item in content:
                                st.markdown(f"- {item}")
                        elif isinstance(content, dict):
                            for k, v in content.items():
                                st.markdown(f"**{k}:** {v}")

        # Second tab - Raw JSON (simplified)
        with view_tab2:
            # Extract the relevant JSON data
            json_data = {}

            # Include important metadata.
            # Fixed: the last field was written as ' raw_text' (stray leading
            # space), so the raw_text field could never be copied.
            for field in ['file_name', 'timestamp', 'processing_time', 'title', 'languages', 'topics', 'subjects', 'text', 'raw_text']:
                if field in selected_result:
                    json_data[field] = selected_result[field]

            # Include OCR contents
            if 'ocr_contents' in selected_result:
                json_data['ocr_contents'] = selected_result['ocr_contents']

            # Format the JSON prettily
            json_str = json.dumps(json_data, indent=2)

            # Display in a monospace font with syntax highlighting
            st.code(json_str, language="json")

        # Third tab - Images (simplified)
        if has_images and view_tab3 is not None:
            with view_tab3:
                # Simplified image display
                if 'pages_data' in selected_result:
                    for i, page_data in enumerate(selected_result['pages_data']):
                        # Display each page's images
                        if 'images' in page_data and len(page_data['images']) > 0:
                            for img in page_data['images']:
                                if 'image_base64' in img:
                                    st.image(img['image_base64'], use_container_width=True)

                        # Get page text if available
                        page_text = ""
                        if 'markdown' in page_data:
                            page_text = page_data['markdown']

                        # Display text if available
                        if page_text:
                            with st.expander(f"Page {i+1} Text", expanded=False):
                                st.text(page_text)
|
535 |
+
|
536 |
+
def display_about_tab():
    """Render the "Learn More" tab: description, purpose, features, usage
    steps, underlying technologies, and the app version."""
    st.header("Learn More")

    # All tab copy lives in one ordered tuple; each chunk is emitted with a
    # single st.markdown call, keeping the text easy to scan and edit.
    about_chunks = (
        """
    **Historical OCR** is a tailored academic tool for extracting text from historical documents, manuscripts, and printed materials.
    """,
        "### Purpose",
        """
    This tool is designed to assist scholars in historical research by extracting text from challenging documents.
    While it may not achieve full accuracy for all materials, it serves as a tailored research aid for navigating
    historical documents, particularly:
    """,
        """
    - **Historical newspapers** with complex layouts and aged text
    - **Handwritten documents** from various time periods
    - **Photos of archival materials** that may be difficult to read
    """,
        "### Features",
        """
    - **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
    - **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
    - **Editable Results**: Review and edit extracted text directly in the interface
    - **Structured Content Analysis**: Automatic organization of document content
    - **Multi-language Support**: Process documents in various languages
    - **PDF Processing**: Handle multi-page historical documents
    """,
        "### How to Use",
        """
    1. Upload a document (PDF or image)
    2. Select the document type and adjust preprocessing options if needed
    3. Add custom processing instructions for specialized documents
    4. Process the document
    5. Review, edit, and download the results
    """,
        "### Technologies",
        """
    - OCR processing using Mistral AI's advanced document understanding capabilities
    - Image preprocessing with OpenCV
    - PDF handling with pdf2image
    - Web interface with Streamlit
    """,
        "**Version:** 2.0.0",
    )

    for chunk in about_chunks:
        st.markdown(chunk)
|
utils/helpers/language_detection.py
ADDED
@@ -0,0 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Standard library imports
|
2 |
+
import logging
|
3 |
+
import re
|
4 |
+
from typing import List, Dict, Set, Tuple, Optional, Union, Any
|
5 |
+
from functools import lru_cache
|
6 |
+
|
7 |
+
# Configure logging for this module.
# NOTE(review): logging.basicConfig here runs at import time and configures the
# application's *root* logger as a side effect — confirm this is intended for a
# helper module; entry-point configuration is the usual alternative.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
11 |
+
|
12 |
+
class LanguageDetector:
|
13 |
+
"""
|
14 |
+
A language detection system that provides balanced detection across multiple languages
|
15 |
+
using an enhanced statistical approach.
|
16 |
+
"""
|
17 |
+
|
18 |
+
def __init__(self):
|
19 |
+
"""Initialize the language detector with statistical language models"""
|
20 |
+
logger.info("Initializing language detector with statistical models")
|
21 |
+
|
22 |
+
# Initialize language indicators dictionary for statistical detection
|
23 |
+
self._init_language_indicators()
|
24 |
+
# Set thresholds for language detection confidence
|
25 |
+
self.single_lang_confidence = 65 # Minimum score to consider a language detected
|
26 |
+
self.secondary_lang_threshold = 0.75 # Secondary language must be at least this fraction of primary score
|
27 |
+
|
28 |
+
def _init_language_indicators(self):
|
29 |
+
"""Initialize language indicators for statistical detection with historical markers"""
|
30 |
+
# Define indicators for all supported languages with equal detail level
|
31 |
+
# Each language has:
|
32 |
+
# - Distinctive characters
|
33 |
+
# - Common words (including historical forms)
|
34 |
+
# - N-grams (character sequences)
|
35 |
+
# - Historical markers specific to older forms of the language
|
36 |
+
self.language_indicators = {
|
37 |
+
"English": {
|
38 |
+
"chars": [], # English uses basic Latin alphabet without special chars
|
39 |
+
"words": ['the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for', 'it',
|
40 |
+
'with', 'as', 'be', 'on', 'by', 'at', 'this', 'have', 'from', 'or',
|
41 |
+
'an', 'but', 'not', 'what', 'all', 'were', 'when', 'we', 'there', 'can',
|
42 |
+
'would', 'who', 'you', 'been', 'one', 'their', 'has', 'more', 'if', 'no'],
|
43 |
+
"ngrams": ['th', 'he', 'in', 'er', 'an', 're', 'on', 'at', 'en', 'nd', 'ti', 'es', 'or',
|
44 |
+
'ing', 'tion', 'the', 'and', 'tha', 'ent', 'ion'],
|
45 |
+
"historical": {
|
46 |
+
"chars": ['þ', 'ȝ', 'æ', 'ſ'], # Thorn, yogh, ash, long s
|
47 |
+
"words": ['thou', 'thee', 'thy', 'thine', 'hath', 'doth', 'ere', 'whilom', 'betwixt',
|
48 |
+
'ye', 'art', 'wast', 'dost', 'hast', 'shalt', 'mayst', 'verily'],
|
49 |
+
"patterns": ['eth$', '^y[^a-z]', 'ck$', 'aught', 'ought'] # -eth endings, y- prefixes
|
50 |
+
}
|
51 |
+
},
|
52 |
+
"French": {
|
53 |
+
"chars": ['é', 'è', 'ê', 'à', 'ç', 'ù', 'â', 'î', 'ô', 'û', 'ë', 'ï', 'ü'],
|
54 |
+
"words": ['le', 'la', 'les', 'et', 'en', 'de', 'du', 'des', 'un', 'une', 'ce', 'cette',
|
55 |
+
'ces', 'dans', 'par', 'pour', 'sur', 'qui', 'que', 'quoi', 'où', 'quand', 'comment',
|
56 |
+
'est', 'sont', 'ont', 'nous', 'vous', 'ils', 'elles', 'avec', 'sans', 'mais', 'ou'],
|
57 |
+
"ngrams": ['es', 'le', 'de', 'en', 'on', 'nt', 'qu', 'ai', 'an', 'ou', 'ur', 're', 'me',
|
58 |
+
'les', 'ent', 'que', 'des', 'ons', 'ant', 'ion'],
|
59 |
+
"historical": {
|
60 |
+
"chars": ['ſ', 'æ', 'œ'], # Long s and ligatures
|
61 |
+
"words": ['aultre', 'avecq', 'icelluy', 'oncques', 'moult', 'estre', 'mesme', 'ceste',
|
62 |
+
'ledict', 'celuy', 'ceulx', 'aulcun', 'ainſi', 'touſiours', 'eſtre',
|
63 |
+
'eſt', 'meſme', 'felon', 'auec', 'iufques', 'chofe', 'fcience'],
|
64 |
+
"patterns": ['oi[ts]$', 'oi[re]$', 'f[^aeiou]', 'ff', 'ſ', 'auoit', 'eſtoit',
|
65 |
+
'ſi', 'ſur', 'ſa', 'cy', 'ayant', 'oy', 'uſ', 'auſ']
|
66 |
+
},
|
67 |
+
},
|
68 |
+
"German": {
|
69 |
+
"chars": ['ä', 'ö', 'ü', 'ß'],
|
70 |
+
"words": ['der', 'die', 'das', 'und', 'in', 'zu', 'den', 'ein', 'eine', 'mit', 'ist', 'von',
|
71 |
+
'des', 'sich', 'auf', 'für', 'als', 'auch', 'werden', 'bei', 'durch', 'aus', 'sind',
|
72 |
+
'nicht', 'nur', 'wurde', 'wie', 'wenn', 'aber', 'noch', 'nach', 'so', 'sein', 'über'],
|
73 |
+
"ngrams": ['en', 'er', 'ch', 'de', 'ei', 'in', 'te', 'nd', 'ie', 'ge', 'un', 'sch', 'ich',
|
74 |
+
'den', 'die', 'und', 'der', 'ein', 'ung', 'cht'],
|
75 |
+
"historical": {
|
76 |
+
"chars": ['ſ', 'ů', 'ė', 'ÿ'],
|
77 |
+
"words": ['vnnd', 'vnnd', 'vnter', 'vnd', 'seyn', 'thun', 'auff', 'auß', 'deß', 'diß'],
|
78 |
+
"patterns": ['^v[nd]', 'th', 'vnter', 'ſch']
|
79 |
+
}
|
80 |
+
},
|
81 |
+
"Spanish": {
|
82 |
+
"chars": ['á', 'é', 'í', 'ó', 'ú', 'ñ', 'ü', '¿', '¡'],
|
83 |
+
"words": ['el', 'la', 'los', 'las', 'de', 'en', 'y', 'a', 'que', 'por', 'un', 'una', 'no',
|
84 |
+
'es', 'con', 'para', 'su', 'al', 'se', 'del', 'como', 'más', 'pero', 'lo', 'mi',
|
85 |
+
'si', 'ya', 'todo', 'esta', 'cuando', 'hay', 'muy', 'bien', 'sin', 'así'],
|
86 |
+
"ngrams": ['de', 'en', 'os', 'es', 'la', 'ar', 'el', 'er', 'ra', 'as', 'an', 'do', 'or',
|
87 |
+
'que', 'nte', 'los', 'ado', 'con', 'ent', 'ien'],
|
88 |
+
"historical": {
|
89 |
+
"chars": ['ſ', 'ç', 'ñ'],
|
90 |
+
"words": ['facer', 'fijo', 'fermoso', 'agora', 'asaz', 'aver', 'caſa', 'deſde', 'eſte',
|
91 |
+
'eſta', 'eſto', 'deſto', 'deſta', 'eſſo', 'muger', 'dixo', 'fazer'],
|
92 |
+
"patterns": ['^f[aei]', 'ſſ', 'ſc', '^deſ', 'xo$', 'xe$']
|
93 |
+
},
|
94 |
+
},
|
95 |
+
"Italian": {
|
96 |
+
"chars": ['à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú'],
|
97 |
+
"words": ['il', 'la', 'i', 'le', 'e', 'di', 'a', 'in', 'che', 'non', 'per', 'con', 'un',
|
98 |
+
'una', 'del', 'della', 'è', 'sono', 'da', 'si', 'come', 'anche', 'più', 'ma', 'ci',
|
99 |
+
'se', 'ha', 'mi', 'lo', 'ti', 'al', 'tu', 'questo', 'questi'],
|
100 |
+
"ngrams": ['di', 'la', 'er', 'to', 're', 'co', 'de', 'in', 'ra', 'on', 'li', 'no', 'ri',
|
101 |
+
'che', 'ent', 'con', 'per', 'ion', 'ato', 'lla']
|
102 |
+
},
|
103 |
+
"Portuguese": {
|
104 |
+
"chars": ['á', 'â', 'ã', 'à', 'é', 'ê', 'í', 'ó', 'ô', 'õ', 'ú', 'ç'],
|
105 |
+
"words": ['o', 'a', 'os', 'as', 'de', 'em', 'e', 'do', 'da', 'dos', 'das', 'no', 'na',
|
106 |
+
'para', 'que', 'um', 'uma', 'por', 'com', 'se', 'não', 'mais', 'como', 'mas',
|
107 |
+
'você', 'eu', 'este', 'isso', 'ele', 'seu', 'sua', 'ou', 'já', 'me'],
|
108 |
+
"ngrams": ['de', 'os', 'em', 'ar', 'es', 'ra', 'do', 'da', 'en', 'co', 'nt', 'ad', 'to',
|
109 |
+
'que', 'nto', 'ent', 'com', 'ção', 'ado', 'ment']
|
110 |
+
},
|
111 |
+
"Dutch": {
|
112 |
+
"chars": ['ë', 'ï', 'ö', 'ü', 'é', 'è', 'ê', 'ç', 'á', 'à', 'ä', 'ó', 'ô', 'ú', 'ù', 'û', 'ij'],
|
113 |
+
"words": ['de', 'het', 'een', 'en', 'van', 'in', 'is', 'dat', 'op', 'te', 'zijn', 'met',
|
114 |
+
'voor', 'niet', 'aan', 'er', 'die', 'maar', 'dan', 'ik', 'je', 'hij', 'zij', 'we',
|
115 |
+
'kunnen', 'wordt', 'nog', 'door', 'over', 'als', 'uit', 'bij', 'om', 'ook'],
|
116 |
+
"ngrams": ['en', 'de', 'er', 'ee', 'ge', 'an', 'aa', 'in', 'te', 'et', 'ng', 'ee', 'or',
|
117 |
+
'van', 'het', 'een', 'ing', 'ver', 'den', 'sch']
|
118 |
+
},
|
119 |
+
"Russian": {
|
120 |
+
# Russian (Cyrillic alphabet) characters
|
121 |
+
"chars": ['а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п',
|
122 |
+
'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'],
|
123 |
+
"words": ['и', 'в', 'не', 'на', 'что', 'я', 'с', 'а', 'то', 'он', 'как', 'этот', 'по',
|
124 |
+
'но', 'из', 'к', 'у', 'за', 'вы', 'все', 'так', 'же', 'от', 'для', 'о', 'его',
|
125 |
+
'мы', 'было', 'она', 'бы', 'мне', 'еще', 'есть', 'быть', 'был'],
|
126 |
+
"ngrams": ['о', 'е', 'а', 'н', 'и', 'т', 'р', 'с', 'в', 'л', 'к', 'м', 'д',
|
127 |
+
'ст', 'но', 'то', 'ни', 'на', 'по', 'ет']
|
128 |
+
},
|
129 |
+
"Chinese": {
|
130 |
+
"chars": ['的', '是', '不', '了', '在', '和', '有', '我', '们', '人', '这', '上', '中',
|
131 |
+
'个', '大', '来', '到', '国', '时', '要', '地', '出', '会', '可', '也', '就',
|
132 |
+
'年', '生', '对', '能', '自', '那', '都', '得', '说', '过', '子', '家', '后', '多'],
|
133 |
+
# Chinese doesn't have "words" in the same way as alphabetic languages
|
134 |
+
"words": ['的', '是', '不', '了', '在', '和', '有', '我', '们', '人', '这', '上', '中',
|
135 |
+
'个', '大', '来', '到', '国', '时', '要', '地', '出', '会', '可', '也', '就'],
|
136 |
+
"ngrams": ['的', '是', '不', '了', '在', '我', '有', '和', '人', '这', '中', '大', '来', '上',
|
137 |
+
'国', '个', '到', '说', '们', '为']
|
138 |
+
},
|
139 |
+
"Japanese": {
|
140 |
+
# A mix of hiragana, katakana, and common kanji
|
141 |
+
"chars": ['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ',
|
142 |
+
'ア', 'イ', 'ウ', 'エ', 'オ', 'カ', 'キ', 'ク', 'ケ', 'コ', 'サ', 'シ', 'ス', 'セ', 'ソ',
|
143 |
+
'日', '本', '人', '大', '小', '中', '山', '川', '田', '子', '女', '男', '月', '火', '水'],
|
144 |
+
"words": ['は', 'を', 'に', 'の', 'が', 'で', 'へ', 'から', 'より', 'まで', 'だ', 'です', 'した',
|
145 |
+
'ます', 'ません', 'です', 'これ', 'それ', 'あれ', 'この', 'その', 'あの', 'わたし'],
|
146 |
+
"ngrams": ['の', 'は', 'た', 'が', 'を', 'に', 'て', 'で', 'と', 'し', 'か', 'ま', 'こ', 'い',
|
147 |
+
'する', 'いる', 'れる', 'なる', 'れて', 'した']
|
148 |
+
},
|
149 |
+
"Korean": {
|
150 |
+
"chars": ['가', '나', '다', '라', '마', '바', '사', '아', '자', '차', '카', '타', '파', '하',
|
151 |
+
'그', '는', '을', '이', '에', '에서', '로', '으로', '와', '과', '또는', '하지만'],
|
152 |
+
"words": ['이', '그', '저', '나', '너', '우리', '그들', '이것', '그것', '저것', '은', '는',
|
153 |
+
'이', '가', '을', '를', '에', '에서', '으로', '로', '와', '과', '의', '하다', '되다'],
|
154 |
+
"ngrams": ['이', '다', '는', '에', '하', '고', '지', '서', '의', '가', '을', '로', '을', '으',
|
155 |
+
'니다', '습니', '하는', '이다', '에서', '하고']
|
156 |
+
},
|
157 |
+
"Arabic": {
|
158 |
+
"chars": ['ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض',
|
159 |
+
'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي', 'ء', 'ة', 'ى'],
|
160 |
+
"words": ['في', 'من', 'على', 'إلى', 'هذا', 'هذه', 'ذلك', 'تلك', 'هو', 'هي', 'هم', 'أنا',
|
161 |
+
'أنت', 'نحن', 'كان', 'كانت', 'يكون', 'لا', 'لم', 'ما', 'أن', 'و', 'أو', 'ثم', 'بعد'],
|
162 |
+
"ngrams": ['ال', 'ان', 'في', 'من', 'ون', 'ين', 'ات', 'ار', 'ور', 'ما', 'لا', 'ها', 'ان',
|
163 |
+
'الم', 'لان', 'علا', 'الح', 'الس', 'الع', 'الت']
|
164 |
+
},
|
165 |
+
"Hindi": {
|
166 |
+
"chars": ['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ',
|
167 |
+
'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न',
|
168 |
+
'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी',
|
169 |
+
'ु', 'ू', 'े', 'ै', 'ो', 'ौ', '्', 'ं', 'ः'],
|
170 |
+
"words": ['और', 'का', 'के', 'की', 'एक', 'में', 'है', 'यह', 'हैं', 'से', 'को', 'पर', 'इस',
|
171 |
+
'हो', 'गया', 'कर', 'मैं', 'या', 'हुआ', 'था', 'वह', 'अपने', 'सकता', 'ने', 'बहुत'],
|
172 |
+
"ngrams": ['का', 'के', 'की', 'है', 'ने', 'से', 'मे', 'को', 'पर', 'हा', 'रा', 'ता', 'या',
|
173 |
+
'ार', 'ान', 'कार', 'राज', 'ारा', 'जाए', 'ेजा']
|
174 |
+
},
|
175 |
+
"Latin": {
|
176 |
+
"chars": [], # Latin uses basic Latin alphabet
|
177 |
+
"words": ['et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed', 'qui', 'quod', 'ut', 'si',
|
178 |
+
'nec', 'ex', 'per', 'quam', 'pro', 'iam', 'hoc', 'aut', 'esse', 'enim', 'de',
|
179 |
+
'atque', 'ac', 'ante', 'post', 'sub', 'ab'],
|
180 |
+
"ngrams": ['us', 'is', 'um', 'er', 'it', 'nt', 'am', 'em', 're', 'at', 'ti', 'es', 'ur',
|
181 |
+
'tur', 'que', 'ere', 'ent', 'ius', 'rum', 'tus']
|
182 |
+
},
|
183 |
+
"Greek": {
|
184 |
+
"chars": ['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π',
|
185 |
+
'ρ', 'σ', 'ς', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω', 'ά', 'έ', 'ή', 'ί', 'ό', 'ύ', 'ώ'],
|
186 |
+
"words": ['και', 'του', 'της', 'των', 'στο', 'στη', 'με', 'από', 'για', 'είναι', 'να',
|
187 |
+
'ότι', 'δεν', 'στον', 'μια', 'που', 'ένα', 'έχει', 'θα', 'το', 'ο', 'η', 'τον'],
|
188 |
+
"ngrams": ['αι', 'τα', 'ου', 'τη', 'οι', 'το', 'ης', 'αν', 'ος', 'ον', 'ις', 'ει', 'ερ',
|
189 |
+
'και', 'την', 'τον', 'ους', 'νου', 'εντ', 'μεν']
|
190 |
+
}
|
191 |
+
}
|
192 |
+
|
193 |
+
def detect_languages(self, text: str, filename: str = None, current_languages: List[str] = None) -> List[str]:
|
194 |
+
"""
|
195 |
+
Detect languages in text using an enhanced statistical approach
|
196 |
+
|
197 |
+
Args:
|
198 |
+
text: Text to analyze
|
199 |
+
filename: Optional filename to provide additional context
|
200 |
+
current_languages: Optional list of languages already detected
|
201 |
+
|
202 |
+
Returns:
|
203 |
+
List of detected languages
|
204 |
+
"""
|
205 |
+
logger = logging.getLogger("language_detector")
|
206 |
+
|
207 |
+
# If no text provided, return current languages or default
|
208 |
+
if not text or len(text.strip()) < 10:
|
209 |
+
return current_languages if current_languages else ["English"]
|
210 |
+
|
211 |
+
# If we already have detected languages, use them
|
212 |
+
if current_languages and len(current_languages) > 0:
|
213 |
+
logger.info(f"Using already detected languages: {current_languages}")
|
214 |
+
return current_languages
|
215 |
+
|
216 |
+
# Use enhanced statistical detection
|
217 |
+
detected_languages = self._detect_statistically(text, filename)
|
218 |
+
logger.info(f"Statistical language detection results: {detected_languages}")
|
219 |
+
return detected_languages
|
220 |
+
|
221 |
+
def _detect_statistically(self, text: str, filename: str = None) -> List[str]:
|
222 |
+
"""
|
223 |
+
Detect languages using enhanced statistical analysis with historical language indicators
|
224 |
+
|
225 |
+
Args:
|
226 |
+
text: Text to analyze
|
227 |
+
filename: Optional filename for additional context
|
228 |
+
|
229 |
+
Returns:
|
230 |
+
List of detected languages
|
231 |
+
"""
|
232 |
+
logger = logging.getLogger("language_detector")
|
233 |
+
|
234 |
+
# Normalize text to lowercase for consistent analysis
|
235 |
+
text_lower = text.lower()
|
236 |
+
words = re.findall(r'\b\w+\b', text_lower) # Extract words
|
237 |
+
|
238 |
+
# Score each language based on characters, words, n-grams, and historical markers
|
239 |
+
language_scores = {}
|
240 |
+
historical_bonus = {}
|
241 |
+
|
242 |
+
# PHASE 1: Special character analysis
|
243 |
+
# Count special characters for each language
|
244 |
+
special_char_counts = {}
|
245 |
+
total_special_chars = 0
|
246 |
+
|
247 |
+
for language, indicators in self.language_indicators.items():
|
248 |
+
chars = indicators["chars"]
|
249 |
+
count = 0
|
250 |
+
for char in chars:
|
251 |
+
if char in text_lower:
|
252 |
+
count += text_lower.count(char)
|
253 |
+
special_char_counts[language] = count
|
254 |
+
total_special_chars += count
|
255 |
+
|
256 |
+
# Normalize character scores (0-30 points)
|
257 |
+
for language, count in special_char_counts.items():
|
258 |
+
if total_special_chars > 0:
|
259 |
+
# Scale score to 0-30 range (reduced from 35 to make room for historical)
|
260 |
+
normalized_score = (count / total_special_chars) * 30
|
261 |
+
language_scores[language] = normalized_score
|
262 |
+
else:
|
263 |
+
language_scores[language] = 0
|
264 |
+
|
265 |
+
# PHASE 2: Word analysis (0-30 points)
|
266 |
+
# Count common words for each language
|
267 |
+
for language, indicators in self.language_indicators.items():
|
268 |
+
word_list = indicators["words"]
|
269 |
+
word_matches = sum(1 for word in words if word in word_list)
|
270 |
+
|
271 |
+
# Normalize word score based on text length and word list size
|
272 |
+
word_score_factor = min(1.0, word_matches / (len(words) * 0.1)) # Max 1.0 if 10% match
|
273 |
+
language_scores[language] = language_scores.get(language, 0) + (word_score_factor * 30)
|
274 |
+
|
275 |
+
# PHASE 3: N-gram analysis (0-20 points)
|
276 |
+
for language, indicators in self.language_indicators.items():
|
277 |
+
ngram_list = indicators["ngrams"]
|
278 |
+
ngram_matches = 0
|
279 |
+
|
280 |
+
# Count ngram occurrences
|
281 |
+
for ngram in ngram_list:
|
282 |
+
ngram_matches += text_lower.count(ngram)
|
283 |
+
|
284 |
+
# Normalize ngram score based on text length
|
285 |
+
if len(text_lower) > 0:
|
286 |
+
ngram_score_factor = min(1.0, ngram_matches / (len(text_lower) * 0.05)) # Max 1.0 if 5% match
|
287 |
+
language_scores[language] = language_scores.get(language, 0) + (ngram_score_factor * 20)
|
288 |
+
|
289 |
+
# PHASE 4: Historical language markers (0-20 points)
|
290 |
+
for language, indicators in self.language_indicators.items():
|
291 |
+
if "historical" in indicators:
|
292 |
+
historical_indicators = indicators["historical"]
|
293 |
+
historical_score = 0
|
294 |
+
|
295 |
+
# Check for historical chars
|
296 |
+
if "chars" in historical_indicators:
|
297 |
+
for char in historical_indicators["chars"]:
|
298 |
+
if char in text_lower:
|
299 |
+
historical_score += text_lower.count(char) * 0.5
|
300 |
+
|
301 |
+
# Check for historical words
|
302 |
+
if "words" in historical_indicators:
|
303 |
+
hist_words = historical_indicators["words"]
|
304 |
+
hist_word_matches = sum(1 for word in words if word in hist_words)
|
305 |
+
if hist_word_matches > 0:
|
306 |
+
# Historical words are strong indicators
|
307 |
+
historical_score += min(10, hist_word_matches * 2)
|
308 |
+
|
309 |
+
# Check for historical patterns
|
310 |
+
if "patterns" in historical_indicators:
|
311 |
+
for pattern in historical_indicators["patterns"]:
|
312 |
+
matches = len(re.findall(pattern, text_lower))
|
313 |
+
if matches > 0:
|
314 |
+
historical_score += min(5, matches * 0.5)
|
315 |
+
|
316 |
+
# Cap historical score at 20 points
|
317 |
+
historical_score = min(20, historical_score)
|
318 |
+
historical_bonus[language] = historical_score
|
319 |
+
|
320 |
+
# Apply historical bonus
|
321 |
+
language_scores[language] += historical_score
|
322 |
+
|
323 |
+
# Apply language-specific exclusivity multiplier if present
|
324 |
+
if "exclusivity" in indicators:
|
325 |
+
exclusivity = indicators["exclusivity"]
|
326 |
+
language_scores[language] *= exclusivity
|
327 |
+
logger.info(f"Applied exclusivity multiplier {exclusivity} to {language}")
|
328 |
+
|
329 |
+
# Print historical bonus for debugging
|
330 |
+
for language, bonus in historical_bonus.items():
|
331 |
+
if bonus > 0:
|
332 |
+
logger.info(f"Historical language bonus for {language}: {bonus} points")
|
333 |
+
|
334 |
+
# Final language selection with more stringent criteria
|
335 |
+
# Get languages with scores above threshold
|
336 |
+
threshold = self.single_lang_confidence # Higher minimum score
|
337 |
+
candidates = [(lang, score) for lang, score in language_scores.items() if score >= threshold]
|
338 |
+
candidates.sort(key=lambda x: x[1], reverse=True)
|
339 |
+
|
340 |
+
logger.info(f"Language candidates: {candidates}")
|
341 |
+
|
342 |
+
# If we have candidate languages, return top 1-2 with higher threshold for secondary
|
343 |
+
if candidates:
|
344 |
+
# Always take top language
|
345 |
+
result = [candidates[0][0]]
|
346 |
+
|
347 |
+
# Add second language only if it's significantly strong compared to primary
|
348 |
+
# and doesn't have a historical/exclusivity conflict
|
349 |
+
if len(candidates) > 1:
|
350 |
+
primary_lang = candidates[0][0]
|
351 |
+
secondary_lang = candidates[1][0]
|
352 |
+
primary_score = candidates[0][1]
|
353 |
+
secondary_score = candidates[1][1]
|
354 |
+
|
355 |
+
# Only add secondary if it meets threshold and doesn't conflict
|
356 |
+
ratio = secondary_score / primary_score
|
357 |
+
|
358 |
+
# Check for French and Spanish conflict (historical French often gets misidentified)
|
359 |
+
historical_conflict = False
|
360 |
+
if (primary_lang == "French" and secondary_lang == "Spanish" and
|
361 |
+
historical_bonus.get("French", 0) > 5):
|
362 |
+
historical_conflict = True
|
363 |
+
logger.info("Historical French markers detected, suppressing Spanish detection")
|
364 |
+
|
365 |
+
if ratio >= self.secondary_lang_threshold and not historical_conflict:
|
366 |
+
result.append(secondary_lang)
|
367 |
+
logger.info(f"Added secondary language {secondary_lang} (score ratio: {ratio:.2f})")
|
368 |
+
else:
|
369 |
+
logger.info(f"Rejected secondary language {secondary_lang} (score ratio: {ratio:.2f})")
|
370 |
+
|
371 |
+
return result
|
372 |
+
|
373 |
+
# Default to English if no clear signals
|
utils/helpers/letterhead_handler.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Standard library imports
|
2 |
+
import os
|
3 |
+
import logging
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
# Configure logging
|
7 |
+
logging.basicConfig(level=logging.INFO,
|
8 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
9 |
+
logger = logging.getLogger(__name__)
|
10 |
+
|
11 |
+
def is_likely_letterhead(file_path, features=None):
    """
    Determine if a document is likely to contain letterhead or marginalia.

    Args:
        file_path: Path to the document image.
        features: Optional dictionary of pre-extracted features like text density.

    Returns:
        bool: True if the document likely contains letterhead, False otherwise.
    """
    file_name = Path(file_path).name.lower()

    # Cheapest signal first: letterhead-related keywords in the filename.
    if any(token in file_name for token in ('letter', 'letterhead', 'correspondence', 'memo')):
        logger.info(f"Letterhead detected based on filename: {file_name}")
        return True

    if features:
        # Dense text at the top of the page suggests a letterhead banner.
        if features.get('top_density', 0) > 0.5:
            logger.info(f"Letterhead detected based on top text density: {features['top_density']}")
            return True

        # Uneven text distribution suggests marginalia.
        if features.get('density_variance', 0) > 0.3:
            logger.info(f"Possible marginalia detected based on text density variance")
            return True

    # Default to standard document.
    return False
|
46 |
+
|
47 |
+
def get_letterhead_prompt(file_path, features=None):
    """
    Build a specialized OCR prompt for letterhead-style documents.

    Args:
        file_path: Path to the document image (currently unused by the prompt
            itself, kept for interface parity with the detector).
        features: Optional dictionary of pre-extracted features; recognized
            keys are 'is_historical' and 'has_marginalia'.

    Returns:
        str: Specialized prompt for letterhead document OCR.
    """
    # Core instructions shared by every letterhead document.
    parts = [("This document appears to be a letter or includes letterhead elements. "
              "Please extract the following components separately if present:\n"
              "1. Letterhead (header with logo, organization name, address, etc.)\n"
              "2. Date\n"
              "3. Recipient information (address, name, title)\n"
              "4. Salutation (e.g., 'Dear Sir/Madam')\n"
              "5. Main body text\n"
              "6. Closing (e.g., 'Sincerely')\n"
              "7. Signature\n"
              "8. Any footnotes, marginalia, or annotations\n\n"
              "Preserve the original formatting and structure as much as possible.")]

    if features:
        # Extra guidance for period documents.
        if features.get('is_historical'):
            parts.append("\n\nThis appears to be a historical document. Pay special attention to older "
                         "letterhead styles, formal language patterns, and period-specific formatting.")

        # Extra guidance when margin notes were detected.
        if features.get('has_marginalia'):
            parts.append("\n\nThe document contains marginalia or handwritten notes in the margins. "
                         "Please extract these separately from the main text and indicate their position.")

    return "".join(parts)
|
utils/helpers/ocr_text_repair.py
ADDED
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Standard library imports
|
2 |
+
import re
|
3 |
+
import logging
|
4 |
+
from difflib import SequenceMatcher
|
5 |
+
from typing import Tuple, Dict, Any, List, Optional
|
6 |
+
|
7 |
+
# Configure logging
|
8 |
+
logging.basicConfig(level=logging.INFO,
|
9 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
10 |
+
logger = logging.getLogger(__name__)
|
11 |
+
|
12 |
+
def detect_duplicate_text_issues(text: str) -> Tuple[bool, Dict[str, Any]]:
    """
    Detect if OCR text has duplication issues often found in handwritten document OCR.

    Args:
        text: OCR text to analyze.

    Returns:
        Tuple of (has_duplication_issues, details_dict).
    """
    # Nothing meaningful to measure on very short input.
    if not text or len(text) < 100:
        return False, {"duplication_rate": 0.0, "details": "Text too short for analysis"}

    lines = text.split('\n')
    line_count = len(lines)

    # --- Signal 1: exact line repetitions ---------------------------------
    repeated_lines = 0
    line_repetition_indices = []
    first_seen = {}
    for idx, raw in enumerate(lines):
        content = raw.strip()
        if len(content) < 5:  # skip empty or trivially short lines
            continue
        if content in first_seen:
            repeated_lines += 1
            line_repetition_indices.append((first_seen[content], idx))
        else:
            first_seen[content] = idx
    line_repetition_rate = repeated_lines / max(1, line_count)

    # --- Signal 2: near-duplicate 100-char blocks (fuzzy matching) --------
    text_blocks = [text[i:i + 100] for i in range(0, len(text), 100) if i + 100 <= len(text)]
    block_count = len(text_blocks)
    repeated_blocks = 0
    duplicate_sections = []
    for i in range(block_count):
        # Only compare against nearby blocks to keep the scan cheap.
        for j in range(i + 1, min(i + 10, block_count)):
            similarity = SequenceMatcher(None, text_blocks[i], text_blocks[j]).ratio()
            if similarity > 0.8:  # high-similarity threshold
                repeated_blocks += 1
                duplicate_sections.append((i, j, similarity))
                break
    block_repetition_rate = repeated_blocks / max(1, block_count)

    # --- Signal 3: immediately repeated words ("the the") -----------------
    word_pattern = r'\b(\w+)\s+\1\b'
    repeated_words = len(re.findall(word_pattern, text))
    repeated_words_rate = repeated_words / max(1, len(text.split()))

    # Overall rate is the worst of the three signals.
    duplication_rate = max(line_repetition_rate, block_repetition_rate, repeated_words_rate)

    logger.info(f"OCR duplication analysis: line_repetition={line_repetition_rate:.2f}, "
                f"block_repetition={block_repetition_rate:.2f}, "
                f"word_repetition={repeated_words_rate:.2f}, "
                f"final_rate={duplication_rate:.2f}")

    has_duplication = duplication_rate > 0.1

    return has_duplication, {
        "duplication_rate": duplication_rate,
        "line_repetition_rate": line_repetition_rate,
        "block_repetition_rate": block_repetition_rate,
        "word_repetition_rate": repeated_words_rate,
        "repeated_lines": repeated_lines,
        "repeated_blocks": repeated_blocks,
        "repeated_words": repeated_words,
        "duplicate_sections": duplicate_sections[:10],  # cap for brevity
        "repetition_indices": line_repetition_indices[:10],
    }
|
101 |
+
|
102 |
+
def get_enhanced_preprocessing_options(current_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Generate enhanced preprocessing options for improved OCR on handwritten documents.

    Args:
        current_options: Current preprocessing options (if available); the input
            dict is never mutated.

    Returns:
        Dict of enhanced options tuned for handwriting.
    """
    # Work on a copy so the caller's dict stays untouched.
    options = dict(current_options) if current_options else {}

    options.update({
        "document_type": "handwritten",
        "contrast": 1.4,               # boosted contrast for faint pen strokes
        "grayscale": True,
        "adaptive_threshold": True,    # adaptive thresholding suits uneven ink
        "threshold_block_size": 25,    # larger block size for handwriting
        "threshold_c": 10,             # adjusted C value for handwriting detection
        "binarize": False,             # plain binarization loses stroke detail
        "denoise": True,               # despeckle to reduce noise
        "handwriting_mode": True,
    })

    # Sharpening tends to fragment handwriting; turn it off if it was set.
    if "sharpen" in options:
        options["sharpen"] = False

    logger.info(f"Enhanced handwriting preprocessing options generated: {options}")
    return options
|
144 |
+
|
145 |
+
def get_handwritten_specific_prompt(current_prompt: Optional[str] = None) -> str:
    """
    Generate a specialized prompt for handwritten document OCR.

    Args:
        current_prompt: Current prompt (if available); non-redundant parts of it
            are merged into the handwriting prompt.

    Returns:
        str: Enhanced prompt for handwritten documents.
    """
    # Core instructions used for every handwritten document.
    base_prompt = ("This is a handwritten document that requires careful transcription. "
                   "Please transcribe all visible handwritten text, preserving the original "
                   "line breaks, paragraph structure, and any special formatting or indentation. "
                   "Pay special attention to:\n"
                   "1. Words that may be difficult to read due to handwriting style\n"
                   "2. Any crossed-out text (indicate with [crossed out: possible text])\n"
                   "3. Insertions or annotations between lines or in margins\n"
                   "4. Maintain the spatial layout of the text as much as possible\n"
                   "5. If there are multiple columns or non-linear text, preserve the reading order\n\n"
                   "If you cannot read a word with confidence, indicate with [?] or provide your best guess as [word?].")

    if not current_prompt:
        return base_prompt

    lowered = current_prompt.lower()
    if "handwritten" not in lowered and "handwriting" not in lowered:
        # No overlap with our instructions: just append the user's context.
        return f"{base_prompt}\n\nAdditional context from user:\n{current_prompt}"

    # The user prompt already talks about handwriting; keep only the sentences
    # that add something beyond our base instructions.
    sentences = [s.strip() for s in current_prompt.split('.') if s.strip()]
    extras = [s for s in sentences
              if "handwritten" not in s.lower() and "handwriting" not in s.lower()]

    if extras:
        return base_prompt + "\n\nAdditional instructions:\n" + ". ".join(extras) + "."

    # Everything in the user prompt was redundant.
    return base_prompt
|
189 |
+
|
190 |
+
def clean_duplicated_text(text: str) -> str:
    """
    Clean up duplicated text often found in OCR output for handwritten documents.

    Removes consecutive duplicate lines, immediately repeated words, and
    back-to-back repeated phrases (3-6 words) while preserving the original
    line structure. (A previous version flattened every newline into a space
    during phrase deduplication, destroying the layout the line-dedup pass
    had just preserved; word/phrase cleanup is now applied per line.)

    Args:
        text: OCR text to clean.

    Returns:
        str: Cleaned text with duplications removed.
    """
    if not text:
        return text

    lines = text.split('\n')

    # Pass 1: drop consecutive duplicate lines and collapse runs of blanks.
    deduped_lines = []
    prev_line = None
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Keep only the first blank line in a run.
            if not deduped_lines or deduped_lines[-1].strip():
                deduped_lines.append(line)
            continue
        # Skip a line identical to the previously kept non-blank line.
        if stripped == prev_line:
            continue
        deduped_lines.append(line)
        prev_line = stripped

    # Passes 2 and 3 operate per line so newlines survive.
    word_pattern = r'\b(\w+)\s+\1\b'
    cleaned_lines = []
    for line in deduped_lines:
        # Pass 2: collapse immediately repeated words ("the the" -> "the").
        line = re.sub(word_pattern, r'\1', line)
        # Pass 3: drop back-to-back repeated phrases within the line.
        cleaned_lines.append(_dedupe_phrases(line))

    final_text = '\n'.join(cleaned_lines)

    # Log the cleaning results.
    original_len = len(text)
    cleaned_len = len(final_text)
    reduction = 100 * (original_len - cleaned_len) / max(1, original_len)
    logger.info(f"Text cleaning: removed {original_len - cleaned_len} chars ({reduction:.1f}% reduction)")

    return final_text


def _dedupe_phrases(line: str) -> str:
    """Remove the second occurrence of any immediately repeated 3-6 word phrase.

    Note: intra-line whitespace is normalized to single spaces (as the previous
    implementation also did, just globally).
    """
    words = line.split()
    cleaned_words = []
    i = 0
    while i < len(words):
        found_repeat = False
        # Check phrase lengths 3 to 6 starting at position i.
        for phrase_len in range(3, min(7, len(words) - i)):
            next_pos = i + phrase_len
            if next_pos + phrase_len <= len(words):
                phrase = ' '.join(words[i:next_pos])
                next_phrase = ' '.join(words[next_pos:next_pos + phrase_len])
                if phrase.lower() == next_phrase.lower():
                    # Keep the first occurrence, skip the second.
                    cleaned_words.extend(words[i:next_pos])
                    i = next_pos + phrase_len
                    found_repeat = True
                    break
        if not found_repeat:
            cleaned_words.append(words[i])
            i += 1
    return ' '.join(cleaned_words)
|
utils/pdf_ocr.py
ADDED
@@ -0,0 +1,457 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
PDFOCR - Module for processing PDF files with OCR and extracting structured data.
|
4 |
+
Provides robust PDF to image conversion before OCR processing.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import json
|
8 |
+
import os
|
9 |
+
import tempfile
|
10 |
+
import logging
|
11 |
+
from pathlib import Path
|
12 |
+
from typing import Optional, Dict, List, Union, Tuple, Any
|
13 |
+
|
14 |
+
# Configure logging
|
15 |
+
logging.basicConfig(level=logging.INFO,
|
16 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
17 |
+
logger = logging.getLogger("pdf_ocr")
|
18 |
+
|
19 |
+
# Import StructuredOCR for OCR processing
|
20 |
+
from structured_ocr import StructuredOCR
|
21 |
+
|
22 |
+
class PDFConversionResult:
    """Outcome of a PDF-to-image conversion.

    Truthiness mirrors the ``success`` flag, so callers may write
    ``if result: ...``.
    """

    def __init__(self,
                 success: bool,
                 images: List[Path] = None,
                 error: str = None,
                 page_count: int = 0,
                 temp_files: List[str] = None):
        """Store the conversion outcome.

        Args:
            success: Whether the conversion was successful.
            images: Paths to the converted images.
            error: Error message if conversion failed.
            page_count: Total number of pages in the PDF.
            temp_files: Temporary files that should be cleaned up later.
        """
        self.success = success
        self.error = error
        self.page_count = page_count
        self.images = images or []
        self.temp_files = temp_files or []

    def __bool__(self):
        """Boolean evaluation reflects conversion success."""
        return self.success

    def cleanup(self):
        """Delete any temporary files produced during conversion (best effort)."""
        for path in self.temp_files:
            try:
                if os.path.exists(path):
                    os.unlink(path)
                    logger.debug(f"Removed temporary file: {path}")
            except Exception as e:
                # Never let cleanup failures propagate; just record them.
                logger.warning(f"Failed to remove temporary file {path}: {e}")
        self.temp_files = []
|
60 |
+
|
61 |
+
|
62 |
+
class PDFOCR:
|
63 |
+
"""Class for processing PDF files with OCR and extracting structured data."""
|
64 |
+
|
65 |
+
def __init__(self, api_key=None):
|
66 |
+
"""Initialize the PDF OCR processor."""
|
67 |
+
self.processor = StructuredOCR(api_key=api_key)
|
68 |
+
self.temp_files = []
|
69 |
+
|
70 |
+
    def __del__(self):
        """Clean up resources when object is destroyed.

        Note: __del__ invocation timing is interpreter-dependent; callers that
        need deterministic removal of temp files should call cleanup() directly.
        """
        self.cleanup()
|
73 |
+
|
74 |
+
def cleanup(self):
|
75 |
+
"""Clean up any temporary files."""
|
76 |
+
for temp_file in self.temp_files:
|
77 |
+
try:
|
78 |
+
if os.path.exists(temp_file):
|
79 |
+
os.unlink(temp_file)
|
80 |
+
logger.debug(f"Removed temporary file: {temp_file}")
|
81 |
+
except Exception as e:
|
82 |
+
logger.warning(f"Failed to remove temporary file {temp_file}: {e}")
|
83 |
+
self.temp_files = []
|
84 |
+
|
85 |
+
def convert_pdf_to_images(self,
|
86 |
+
pdf_path: Union[str, Path],
|
87 |
+
dpi: int = 200,
|
88 |
+
max_pages: Optional[int] = None,
|
89 |
+
page_numbers: Optional[List[int]] = None) -> PDFConversionResult:
|
90 |
+
"""
|
91 |
+
Convert a PDF file to images.
|
92 |
+
|
93 |
+
Args:
|
94 |
+
pdf_path: Path to the PDF file
|
95 |
+
dpi: DPI for the output images
|
96 |
+
max_pages: Maximum number of pages to convert (None for all)
|
97 |
+
page_numbers: Specific page numbers to convert (1-based indexing)
|
98 |
+
|
99 |
+
Returns:
|
100 |
+
PDFConversionResult object with conversion results
|
101 |
+
"""
|
102 |
+
pdf_path = Path(pdf_path)
|
103 |
+
if not pdf_path.exists():
|
104 |
+
return PDFConversionResult(
|
105 |
+
success=False,
|
106 |
+
error=f"PDF file not found: {pdf_path}"
|
107 |
+
)
|
108 |
+
|
109 |
+
# Check file size
|
110 |
+
file_size_mb = pdf_path.stat().st_size / (1024 * 1024)
|
111 |
+
logger.info(f"PDF size: {file_size_mb:.2f} MB")
|
112 |
+
|
113 |
+
try:
|
114 |
+
# Import pdf2image for conversion
|
115 |
+
import pdf2image
|
116 |
+
|
117 |
+
# Initialize list for temporary files
|
118 |
+
temp_files = []
|
119 |
+
|
120 |
+
# Optimize conversion parameters based on file size
|
121 |
+
thread_count = min(4, os.cpu_count() or 2)
|
122 |
+
|
123 |
+
# First, determine total pages in the document
|
124 |
+
logger.info("Determining PDF page count...")
|
125 |
+
try:
|
126 |
+
# Use a lightweight approach with multi-threading for faster processing
|
127 |
+
pdf_info = pdf2image.convert_from_path(
|
128 |
+
pdf_path,
|
129 |
+
dpi=72, # Low DPI just for info
|
130 |
+
first_page=1,
|
131 |
+
last_page=1,
|
132 |
+
size=(100, 100), # Tiny image to save memory
|
133 |
+
fmt="jpeg",
|
134 |
+
thread_count=thread_count,
|
135 |
+
output_file=None
|
136 |
+
)
|
137 |
+
|
138 |
+
# Get page count from poppler info if available
|
139 |
+
if hasattr(pdf_info, 'n_pages'):
|
140 |
+
total_pages = pdf_info.n_pages
|
141 |
+
else:
|
142 |
+
# Try a different approach to get page count
|
143 |
+
try:
|
144 |
+
from pypdf import PdfReader
|
145 |
+
reader = PdfReader(pdf_path)
|
146 |
+
total_pages = len(reader.pages)
|
147 |
+
except:
|
148 |
+
total_pages = 1
|
149 |
+
logger.warning("Could not determine total page count, assuming 1 page")
|
150 |
+
except Exception as e:
|
151 |
+
logger.warning(f"Failed to determine page count: {e}")
|
152 |
+
total_pages = 1
|
153 |
+
|
154 |
+
logger.info(f"PDF has {total_pages} total pages")
|
155 |
+
|
156 |
+
# Determine which pages to process
|
157 |
+
pages_to_process = []
|
158 |
+
|
159 |
+
# If specific pages are requested, use those
|
160 |
+
if page_numbers and any(1 <= p <= total_pages for p in page_numbers):
|
161 |
+
pages_to_process = [p for p in page_numbers if 1 <= p <= total_pages]
|
162 |
+
logger.info(f"Converting {len(pages_to_process)} specified pages: {pages_to_process}")
|
163 |
+
# If max_pages is set, limit to that number
|
164 |
+
elif max_pages and max_pages < total_pages:
|
165 |
+
pages_to_process = list(range(1, max_pages + 1))
|
166 |
+
logger.info(f"Converting first {max_pages} pages of {total_pages} total")
|
167 |
+
# Otherwise convert all pages if reasonable count
|
168 |
+
else:
|
169 |
+
pages_to_process = list(range(1, total_pages + 1))
|
170 |
+
logger.info(f"Converting all {total_pages} pages")
|
171 |
+
|
172 |
+
# Convert PDF to images
|
173 |
+
converted_images = []
|
174 |
+
|
175 |
+
# Process in batches for better memory management
|
176 |
+
batch_size = min(5, len(pages_to_process)) # Process up to 5 pages at a time
|
177 |
+
for i in range(0, len(pages_to_process), batch_size):
|
178 |
+
batch_pages = pages_to_process[i:i+batch_size]
|
179 |
+
logger.info(f"Converting batch of pages {batch_pages}")
|
180 |
+
|
181 |
+
# Convert this batch of pages
|
182 |
+
try:
|
183 |
+
batch_images = pdf2image.convert_from_path(
|
184 |
+
pdf_path,
|
185 |
+
dpi=dpi,
|
186 |
+
first_page=min(batch_pages),
|
187 |
+
last_page=max(batch_pages),
|
188 |
+
thread_count=thread_count,
|
189 |
+
fmt="jpeg"
|
190 |
+
)
|
191 |
+
|
192 |
+
# Map converted images to requested page numbers
|
193 |
+
for idx, page_num in enumerate(range(min(batch_pages), max(batch_pages) + 1)):
|
194 |
+
if page_num in pages_to_process and idx < len(batch_images):
|
195 |
+
# Save the image to a temporary file
|
196 |
+
img_temp_path = tempfile.NamedTemporaryFile(suffix=f'_page{page_num}.jpg', delete=False).name
|
197 |
+
batch_images[idx].save(img_temp_path, format='JPEG', quality=95)
|
198 |
+
|
199 |
+
# Add to results and track the temp file
|
200 |
+
converted_images.append((page_num, Path(img_temp_path)))
|
201 |
+
temp_files.append(img_temp_path)
|
202 |
+
except Exception as e:
|
203 |
+
logger.error(f"Failed to convert batch {batch_pages}: {e}")
|
204 |
+
# Continue with other batches
|
205 |
+
|
206 |
+
# Sort by page number to ensure correct order
|
207 |
+
converted_images.sort(key=lambda x: x[0])
|
208 |
+
|
209 |
+
# Extract just the image paths in correct page order
|
210 |
+
image_paths = [img_path for _, img_path in converted_images]
|
211 |
+
|
212 |
+
if not image_paths:
|
213 |
+
# No images were successfully converted
|
214 |
+
return PDFConversionResult(
|
215 |
+
success=False,
|
216 |
+
error="Failed to convert PDF to images",
|
217 |
+
page_count=total_pages,
|
218 |
+
temp_files=temp_files
|
219 |
+
)
|
220 |
+
|
221 |
+
# Store temp files for later cleanup
|
222 |
+
self.temp_files.extend(temp_files)
|
223 |
+
|
224 |
+
# Return successful result
|
225 |
+
return PDFConversionResult(
|
226 |
+
success=True,
|
227 |
+
images=image_paths,
|
228 |
+
page_count=total_pages,
|
229 |
+
temp_files=temp_files
|
230 |
+
)
|
231 |
+
|
232 |
+
except ImportError:
|
233 |
+
return PDFConversionResult(
|
234 |
+
success=False,
|
235 |
+
error="pdf2image module not available. Please install with: pip install pdf2image"
|
236 |
+
)
|
237 |
+
except Exception as e:
|
238 |
+
logger.error(f"PDF conversion error: {str(e)}")
|
239 |
+
return PDFConversionResult(
|
240 |
+
success=False,
|
241 |
+
error=f"Failed to convert PDF to images: {str(e)}"
|
242 |
+
)
|
243 |
+
|
244 |
+
def process_pdf(self, pdf_path, use_vision=True, max_pages=None, custom_pages=None, custom_prompt=None):
    """
    Process a PDF file with OCR and extract structured data.

    Strategy: first convert the PDF to page images and OCR each page
    (vision model on the first page, lightweight text extraction on the
    rest), merging the per-page text and detected languages into a single
    result. If conversion or image processing fails, fall back to direct
    StructuredOCR processing of the PDF file itself.

    Args:
        pdf_path: Path to the PDF file
        use_vision: Whether to use vision model for improved analysis
        max_pages: Maximum number of pages to process
        custom_pages: Specific page numbers to process (1-based indexing),
            given as a list/tuple of ints or a comma-separated string
        custom_prompt: Custom instructions for processing

    Returns:
        Dictionary with structured OCR results

    Raises:
        FileNotFoundError: If pdf_path does not exist
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # Normalize custom_pages into a list of ints (or None if unusable)
    page_numbers = None
    if custom_pages:
        if isinstance(custom_pages, (list, tuple)):
            page_numbers = list(custom_pages)
        else:
            try:
                # Try to parse as comma-separated string
                page_numbers = [int(p.strip()) for p in str(custom_pages).split(',')]
            except ValueError:
                # Narrowed from a bare `except:` so real errors are not swallowed
                logger.warning(f"Invalid custom_pages format: {custom_pages}. Should be list or comma-separated string.")

    # First try our optimized PDF to image conversion
    conversion_result = self.convert_pdf_to_images(
        pdf_path=pdf_path,
        max_pages=max_pages,
        page_numbers=page_numbers
    )

    if conversion_result.success and conversion_result.images:
        logger.info(f"Successfully converted PDF to {len(conversion_result.images)} images")

        # Determine if we need to add PDF-specific context to the prompt
        modified_prompt = custom_prompt
        if not modified_prompt:
            modified_prompt = f"This is a multi-page PDF document with {conversion_result.page_count} total pages, of which {len(conversion_result.images)} were processed."
        elif "pdf" not in modified_prompt.lower() and "multi-page" not in modified_prompt.lower():
            modified_prompt += f" This is a multi-page PDF document with {conversion_result.page_count} total pages, of which {len(conversion_result.images)} were processed."

        try:
            # First process the first page with vision if requested
            first_page_result = self.processor.process_file(
                file_path=conversion_result.images[0],
                file_type="image",
                use_vision=use_vision,
                custom_prompt=modified_prompt
            )

            # Accumulators for text and languages across all pages
            all_pages_text = []
            all_languages = set()

            # Extract text from first page
            if 'ocr_contents' in first_page_result and 'raw_text' in first_page_result['ocr_contents']:
                all_pages_text.append(first_page_result['ocr_contents']['raw_text'])

            # Track languages from first page
            if 'languages' in first_page_result:
                for lang in first_page_result['languages']:
                    all_languages.add(str(lang))

            # Process additional pages if any
            for i, img_path in enumerate(conversion_result.images[1:], 1):
                try:
                    # Simple text extraction for additional pages
                    page_result = self.processor.process_file(
                        file_path=img_path,
                        file_type="image",
                        use_vision=False,  # Use simpler processing for additional pages
                        custom_prompt=f"This is page {i+1} of a {conversion_result.page_count}-page document."
                    )

                    # Extract text
                    if 'ocr_contents' in page_result and 'raw_text' in page_result['ocr_contents']:
                        all_pages_text.append(page_result['ocr_contents']['raw_text'])

                    # Track languages
                    if 'languages' in page_result:
                        for lang in page_result['languages']:
                            all_languages.add(str(lang))
                except Exception as e:
                    # A single failed page should not abort the whole document
                    logger.warning(f"Error processing page {i+1}: {e}")

            # Combine all text into a single document
            combined_text = "\n\n".join(all_pages_text)

            # Update the first page result with combined data
            if 'ocr_contents' in first_page_result:
                first_page_result['ocr_contents']['raw_text'] = combined_text

            # Update languages with all detected languages
            if all_languages:
                first_page_result['languages'] = list(all_languages)

            # Add PDF metadata
            first_page_result['file_name'] = pdf_path.name
            first_page_result['file_type'] = "pdf"
            first_page_result['total_pages'] = conversion_result.page_count
            first_page_result['processed_pages'] = len(conversion_result.images)

            # Add conversion info
            first_page_result['pdf_conversion'] = {
                "method": "pdf2image",
                "pages_converted": len(conversion_result.images),
                "pages_requested": len(page_numbers) if page_numbers else (max_pages or conversion_result.page_count)
            }

            return first_page_result
        except Exception as e:
            logger.error(f"Error processing converted images: {e}")
            # Fall back to direct processing via StructuredOCR

        finally:
            # Clean up temporary page images regardless of outcome
            conversion_result.cleanup()

    # If conversion failed or processing the images failed, fall back to direct processing
    logger.info(f"Using direct StructuredOCR processing for PDF")
    return self.processor.process_file(
        file_path=pdf_path,
        file_type="pdf",
        use_vision=use_vision,
        max_pages=max_pages,
        custom_pages=custom_pages,
        custom_prompt=custom_prompt
    )
|
378 |
+
|
379 |
+
def save_json_output(self, pdf_path, output_path, use_vision=True, max_pages=None, custom_pages=None, custom_prompt=None):
    """
    Process a PDF file and save the structured output as JSON.

    Args:
        pdf_path: Path to the PDF file
        output_path: Path where to save the JSON output
        use_vision: Whether to use vision model for improved analysis
        max_pages: Maximum number of pages to process
        custom_pages: Specific page numbers to process (1-based indexing)
        custom_prompt: Custom instructions for processing

    Returns:
        Path to the saved JSON file
    """
    # Process the PDF
    result = self.process_pdf(
        pdf_path,
        use_vision=use_vision,
        max_pages=max_pages,
        custom_pages=custom_pages,
        custom_prompt=custom_prompt
    )

    # Save the result to JSON, creating parent directories as needed
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8 avoids platform-dependent default encodings (e.g. on Windows)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)

    return output_path
|
411 |
+
|
412 |
+
# For testing directly: minimal CLI around PDFOCR
if __name__ == "__main__":
    import sys
    import argparse

    parser = argparse.ArgumentParser(description="Process PDF files with OCR.")
    parser.add_argument("pdf_path", help="Path to the PDF file to process")
    parser.add_argument("--output", "-o", help="Path to save the output JSON")
    parser.add_argument("--no-vision", dest="use_vision", action="store_false",
                        help="Disable vision model for processing")
    parser.add_argument("--max-pages", type=int, help="Maximum number of pages to process")
    parser.add_argument("--pages", help="Specific pages to process (comma-separated)")
    parser.add_argument("--prompt", help="Custom prompt for processing")

    args = parser.parse_args()

    processor = PDFOCR()

    # Parse custom pages if provided
    custom_pages = None
    if args.pages:
        try:
            custom_pages = [int(p.strip()) for p in args.pages.split(',')]
        except ValueError:
            # Narrowed from a bare `except:` so only bad numeric input triggers the exit
            print(f"Error parsing pages: {args.pages}. Should be comma-separated list of numbers.")
            sys.exit(1)

    if args.output:
        # Process and persist to JSON
        result_path = processor.save_json_output(
            args.pdf_path,
            args.output,
            use_vision=args.use_vision,
            max_pages=args.max_pages,
            custom_pages=custom_pages,
            custom_prompt=args.prompt
        )
        print(f"Results saved to: {result_path}")
    else:
        # Process and dump the structured result to stdout
        result = processor.process_pdf(
            args.pdf_path,
            use_vision=args.use_vision,
            max_pages=args.max_pages,
            custom_pages=custom_pages,
            custom_prompt=args.prompt
        )
        print(json.dumps(result, indent=2))
|