milwright commited on
Commit
94e74f0
·
1 Parent(s): 3030658

modularize + nest scripts; reduce technical debt

Browse files
.clinerules/hocr-basics-api.md ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HOCR Basics: API Integrations (Streamlit and Mistral OCR)
2
+
3
+ This rule defines the essential development standards for integrating the Mistral OCR API and using Streamlit components in the `milwright/historical-ocr` application.
4
+
5
+ ## 📌 Rule 1: Mistral OCR API Usage
6
+
7
+ * **Endpoint:**
8
+ `POST https://api.mistral.ai/v1/ocr`
9
+
10
+ * **Headers:**
11
+
12
+ ```http
13
+ Authorization: Bearer YOUR_API_KEY
14
+ Content-Type: application/json
15
+ ```
16
+
17
+ * **Required JSON Body Fields:**
18
+
19
+ ```json
20
+ {
21
+ "file_url": "https://example.com/your.pdf"
22
+ }
23
+ ```
24
+
25
+ * **Expected Response Fields:**
26
+
27
+ * `text`: Raw OCR output
28
+ * `metadata`: Document structure, language, layout information
29
+
30
+ > **Note:** Always validate presence of required fields and handle error codes gracefully.
31
+
32
+ ---
33
+
34
+ ## 🖼️ Rule 2: Streamlit Usage Standards
35
+
36
+ * Use these core components:
37
+
38
+ * `st.file_uploader()`
39
+ * `st.selectbox()`
40
+ * `st.image()`
41
+ * `st.markdown()`
42
+ * `st.download_button()`
43
+
44
+ * Always set:
45
+ `use_container_width=True` for responsive display where supported
46
+
47
+ * Avoid global state; prefer `st.session_state` for interactivity and stateful inputs
48
+
49
+ ## Mistral OCR Examples
50
+
51
+ ```json
52
+ {
53
+ "id": "string",
54
+ "object": "model",
55
+ "created": 0,
56
+ "model": "string",
57
+ "id": "string",
58
+ "document": {
59
+ "document_url": "string",
60
+ "document_name": "string",
61
+ "type": "document_url"
62
+ },
63
+ "pages": [
64
+ 0
65
+ ],
66
+ "include_image_base64": true,
67
+ "image_limit": 0,
68
+ "image_min_size": 0
69
+ }
70
+ ```
71
+
72
+ ```json
73
+ {
74
+ "pages": [
75
+ {
76
+ "index": 0,
77
+ "markdown": "string",
78
+ "images": [
79
+ {
80
+ "id": "string",
81
+ "top_left_x": 0,
82
+ "top_left_y": 0,
83
+ "bottom_right_x": 0,
84
+ "bottom_right_y": 0,
85
+ "image_base64": "string"
86
+ }
87
+ ],
88
+ "dimensions": {
89
+ "dpi": 0,
90
+ "height": 0,
91
+ "width": 0
92
+ }
93
+ }
94
+ ],
95
+ "model": "string",
96
+ "usage_info": {
97
+ "pages_processed": 0,
98
+ "doc_size_bytes": 0
99
+ }
100
+ }
101
+ ```
102
+
103
+ ### Links and Resources to Understand
104
+
105
+ * [URL to Mistral OCR API doc](https://docs.mistral.ai/api/#tag/batch/operation/jobs_api_routes_batch_cancel_batch_job)
106
+ * [URL to Streamlit API documentation](https://docs.streamlit.io/develop/api-reference)
.clinerules/project-brief.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Project Brief
2
+
3
+ Historical OCR is an advanced optical character recognition (OCR) application designed to support historical research. It leverages Mistral AI's OCR models alongside image preprocessing pipelines optimized for archival material.
4
+
5
+ ## High-Level Overview
6
+
7
+ Building a Streamlit-based web application to process historical documents (images or PDFs), optimize them for OCR using advanced preprocessing techniques, and extract structured text and metadata through Mistral's large language models.
8
+
9
+ ## Core Requirements and Goals
10
+
11
+ Upload and preprocess historical documents
12
+
13
+ Automatically detect document types (e.g., handwritten letters, scientific papers)
14
+
15
+ Apply tailored OCR prompting and structured output based on document type
16
+
17
+ Support user-defined contextual instructions to refine output
18
+
19
+ Provide downloadable structured transcripts and analysis
20
+
21
+ Example: "Building a Streamlit web app for OCR transcription and structured extraction from historical documents using Mistral AI."
.gitignore CHANGED
@@ -32,3 +32,4 @@ input/*.pdf
32
 
33
  # Temporary documents
34
  Tmplf6xnkgr*
 
 
32
 
33
  # Temporary documents
34
  Tmplf6xnkgr*
35
+ .env
app.py CHANGED
@@ -20,7 +20,7 @@ import streamlit as st
20
  # Local application/module imports
21
  from preprocessing import convert_pdf_to_images, preprocess_image
22
  from ocr_processing import process_file
23
- from ui_components import (
24
  ProgressReporter,
25
  create_sidebar_options,
26
  display_results,
 
20
  # Local application/module imports
21
  from preprocessing import convert_pdf_to_images, preprocess_image
22
  from ocr_processing import process_file
23
+ from ui.ui_components import (
24
  ProgressReporter,
25
  create_sidebar_options,
26
  display_results,
config.py CHANGED
@@ -17,39 +17,34 @@ load_dotenv()
17
  # Priority order:
18
  # 1. HF_API_KEY environment variable (Hugging Face standard)
19
  # 2. HUGGING_FACE_API_KEY environment variable (alternative name)
20
- # 3. MISTRAL_API_KEY environment variable (fallback)
21
- # 4. Empty string (will show warning in app)
 
22
 
23
  MISTRAL_API_KEY = os.environ.get("HF_API_KEY",
24
  os.environ.get("HUGGING_FACE_API_KEY",
25
- os.environ.get("MISTRAL_API_KEY", ""))).strip()
 
26
 
27
  if not MISTRAL_API_KEY:
28
  logger.warning("No Mistral API key found in environment variables. API functionality will be limited.")
29
 
30
  # Check if we're in test mode (allows operation without valid API key)
31
- # Set to False to use actual API calls
32
  TEST_MODE = False
33
 
34
- # Just check if API key exists
35
- if not MISTRAL_API_KEY and not TEST_MODE:
36
- logger.warning("No Mistral API key found. OCR functionality will not work unless TEST_MODE is enabled.")
37
-
38
- if TEST_MODE:
39
- logger.info("TEST_MODE is enabled. Using mock responses instead of actual API calls.")
40
-
41
  # Model settings with fallbacks
42
  OCR_MODEL = os.environ.get("MISTRAL_OCR_MODEL", "mistral-ocr-latest")
43
  TEXT_MODEL = os.environ.get("MISTRAL_TEXT_MODEL", "mistral-small-latest") # Updated from ministral-8b-latest
44
- VISION_MODEL = os.environ.get("MISTRAL_VISION_MODEL", "mistral-small-latest") # Using faster model that supports vision
45
 
46
  # Image preprocessing settings optimized for historical documents
47
  # These can be customized from environment variables
48
  IMAGE_PREPROCESSING = {
49
- "enhance_contrast": float(os.environ.get("ENHANCE_CONTRAST", "1.8")), # Increased contrast for better text recognition
50
  "sharpen": os.environ.get("SHARPEN", "True").lower() in ("true", "1", "yes"),
51
  "denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
52
- "max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "12.0")), # Increased size limit for better quality
53
  "target_dpi": int(os.environ.get("TARGET_DPI", "300")), # Target DPI for scaling
54
  "compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "100")), # Higher quality for better OCR results
55
  # # Enhanced settings for handwritten documents
 
17
  # Priority order:
18
  # 1. HF_API_KEY environment variable (Hugging Face standard)
19
  # 2. HUGGING_FACE_API_KEY environment variable (alternative name)
20
+ # 3. HF_MISTRAL_API_KEY environment variable (for Hugging Face deployment)
21
+ # 4. MISTRAL_API_KEY environment variable (fallback)
22
+ # 5. Empty string (will show warning in app)
23
 
24
  MISTRAL_API_KEY = os.environ.get("HF_API_KEY",
25
  os.environ.get("HUGGING_FACE_API_KEY",
26
+ os.environ.get("HF_MISTRAL_API_KEY",
27
+ os.environ.get("MISTRAL_API_KEY", "")))).strip()
28
 
29
  if not MISTRAL_API_KEY:
30
  logger.warning("No Mistral API key found in environment variables. API functionality will be limited.")
31
 
32
  # Check if we're in test mode (allows operation without valid API key)
33
+ # Set to False to use actual API calls with Mistral API
34
  TEST_MODE = False
35
 
 
 
 
 
 
 
 
36
  # Model settings with fallbacks
37
  OCR_MODEL = os.environ.get("MISTRAL_OCR_MODEL", "mistral-ocr-latest")
38
  TEXT_MODEL = os.environ.get("MISTRAL_TEXT_MODEL", "mistral-small-latest") # Updated from ministral-8b-latest
39
+ VISION_MODEL = os.environ.get("MISTRAL_VISION_MODEL", "mistral-small-latest") # faster model that supports vision
40
 
41
  # Image preprocessing settings optimized for historical documents
42
  # These can be customized from environment variables
43
  IMAGE_PREPROCESSING = {
44
+ "enhance_contrast": float(os.environ.get("ENHANCE_CONTRAST", "3.5")), # Increased contrast for better text recognition
45
  "sharpen": os.environ.get("SHARPEN", "True").lower() in ("true", "1", "yes"),
46
  "denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
47
+ "max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "200.0")), # Increased size limit for better quality
48
  "target_dpi": int(os.environ.get("TARGET_DPI", "300")), # Target DPI for scaling
49
  "compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "100")), # Higher quality for better OCR results
50
  # # Enhanced settings for handwritten documents
ocr_processing.py CHANGED
@@ -82,7 +82,7 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
82
 
83
  # Create a container for progress indicators if not provided
84
  if progress_reporter is None:
85
- from ui_components import ProgressReporter
86
  progress_reporter = ProgressReporter(st.empty()).setup()
87
 
88
  # Initialize temporary file paths list
@@ -119,10 +119,7 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
119
 
120
  # For PDFs, we need to handle differently
121
  if file_type == "pdf":
122
- progress_reporter.update(20, "Converting PDF to images...")
123
-
124
- # Process PDF with direct handling
125
- progress_reporter.update(30, "Processing PDF with OCR...")
126
 
127
  # Create a temporary file for processing
128
  temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=file_ext).name
@@ -145,91 +142,98 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
145
  custom_prompt
146
  )
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  # Process with cached function if possible
149
  try:
150
- # Use the document type information from preprocessing options
151
- doc_type = preprocessing_options.get("document_type", "standard")
152
- modified_custom_prompt = custom_prompt
153
-
154
- # Add PDF-specific instructions
155
- if not modified_custom_prompt:
156
- modified_custom_prompt = "This is a multi-page PDF document."
157
- elif "pdf" not in modified_custom_prompt.lower() and "multi-page" not in modified_custom_prompt.lower():
158
- modified_custom_prompt += " This is a multi-page PDF document."
159
-
160
- # Update the cache key with the modified prompt
161
- if modified_custom_prompt != custom_prompt:
162
- cache_key = generate_cache_key(
163
- open(temp_path, 'rb').read(),
164
- file_type,
165
- use_vision,
166
- preprocessing_options,
167
- pdf_rotation,
168
- modified_custom_prompt
169
- )
170
-
171
- result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options), modified_custom_prompt)
172
  progress_reporter.update(90, "Finalizing results...")
173
  except Exception as e:
174
- logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
175
- progress_reporter.update(60, f"Processing error: {str(e)}. Retrying...")
176
-
177
- # If caching fails, process directly
178
- processor = StructuredOCR()
179
-
180
 
181
- # Use the document type from preprocessing options
182
- doc_type = preprocessing_options.get("document_type", "standard")
183
- modified_custom_prompt = custom_prompt
184
-
185
- # Check for letterhead/marginalia document types with specialized handling
186
  try:
187
- from letterhead_handler import get_letterhead_prompt, is_likely_letterhead
188
- # Extract text density features if available
189
- features = None
190
- if 'text_density' in preprocessing_options:
191
- features = preprocessing_options['text_density']
192
-
193
- # Check if this looks like a letterhead document
194
- if is_likely_letterhead(temp_path, features):
195
- # Get specialized letterhead prompt
196
- letterhead_prompt = get_letterhead_prompt(temp_path, features)
197
- if letterhead_prompt:
198
- logger.info(f"Using specialized letterhead prompt for document")
199
- modified_custom_prompt = letterhead_prompt
200
- # Set document type for tracking
201
- preprocessing_options["document_type"] = "letterhead"
202
- doc_type = "letterhead"
203
  except ImportError:
204
- logger.debug("Letterhead handler not available")
205
-
206
- # Add document-type specific instructions based on preprocessing options
207
- if doc_type == "handwritten" and not modified_custom_prompt:
208
- modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
209
- elif doc_type == "handwritten" and "handwritten" not in modified_custom_prompt.lower():
210
- modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
211
- elif doc_type == "newspaper" and not modified_custom_prompt:
212
- modified_custom_prompt = "This is a newspaper or document with columns. Please extract all text content from each column, maintaining proper reading order."
213
- elif doc_type == "newspaper" and "column" not in modified_custom_prompt.lower() and "newspaper" not in modified_custom_prompt.lower():
214
- modified_custom_prompt += " This appears to be a newspaper or document with columns. Please extract all text content from each column."
215
- elif doc_type == "book" and not modified_custom_prompt:
216
- modified_custom_prompt = "This is a book page. Extract titles, headers, footnotes, and body text, preserving paragraph structure and formatting."
217
-
218
- # Add PDF-specific instructions if needed
219
- if "pdf" not in modified_custom_prompt.lower() and "multi-page" not in modified_custom_prompt.lower():
220
- modified_custom_prompt += " This is a multi-page PDF document."
221
-
222
- # Process directly with optimized settings
223
- result = processor.process_file(
224
- file_path=temp_path,
225
- file_type="pdf",
226
- use_vision=use_vision,
227
- custom_prompt=modified_custom_prompt,
228
- file_size_mb=file_size_mb,
229
- pdf_rotation=pdf_rotation
230
- )
231
-
232
- progress_reporter.update(90, "Finalizing results...")
233
  else:
234
  # For image files
235
  progress_reporter.update(20, "Preparing image for processing...")
@@ -390,7 +394,7 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
390
 
391
  # Check for letterhead/marginalia document types with specialized handling
392
  try:
393
- from letterhead_handler import get_letterhead_prompt, is_likely_letterhead
394
  # Extract text density features if available
395
  features = None
396
  if 'text_density' in preprocessing_options:
@@ -453,7 +457,7 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
453
 
454
  # Check for letterhead/marginalia document types with specialized handling
455
  try:
456
- from letterhead_handler import get_letterhead_prompt, is_likely_letterhead
457
  # Extract text density features if available
458
  features = None
459
  if 'text_density' in preprocessing_options:
@@ -503,7 +507,7 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
503
 
504
  # Check for duplicated text patterns that indicate handwritten text issues
505
  try:
506
- from ocr_text_repair import detect_duplicate_text_issues, get_enhanced_preprocessing_options, get_handwritten_specific_prompt, clean_duplicated_text
507
 
508
  # Check OCR output for duplication issues
509
  if result and 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
 
82
 
83
  # Create a container for progress indicators if not provided
84
  if progress_reporter is None:
85
+ from ui.ui_components import ProgressReporter
86
  progress_reporter = ProgressReporter(st.empty()).setup()
87
 
88
  # Initialize temporary file paths list
 
119
 
120
  # For PDFs, we need to handle differently
121
  if file_type == "pdf":
122
+ progress_reporter.update(20, "Preparing PDF document...")
 
 
 
123
 
124
  # Create a temporary file for processing
125
  temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=file_ext).name
 
142
  custom_prompt
143
  )
144
 
145
+ # Use the document type information from preprocessing options
146
+ doc_type = preprocessing_options.get("document_type", "standard")
147
+ modified_custom_prompt = custom_prompt
148
+
149
+ # Enhance the prompt with document-type specific instructions
150
+ # Check for letterhead/marginalia document types with specialized handling
151
+ try:
152
+ from utils.helpers.letterhead_handler import get_letterhead_prompt, is_likely_letterhead
153
+ # Extract text density features if available
154
+ features = None
155
+ if 'text_density' in preprocessing_options:
156
+ features = preprocessing_options['text_density']
157
+
158
+ # Check if this looks like a letterhead document
159
+ if is_likely_letterhead(temp_path, features):
160
+ # Get specialized letterhead prompt
161
+ letterhead_prompt = get_letterhead_prompt(temp_path, features)
162
+ if letterhead_prompt:
163
+ logger.info(f"Using specialized letterhead prompt for document")
164
+ modified_custom_prompt = letterhead_prompt
165
+ # Set document type for tracking
166
+ preprocessing_options["document_type"] = "letterhead"
167
+ doc_type = "letterhead"
168
+ except ImportError:
169
+ logger.debug("Letterhead handler not available")
170
+
171
+ # Add document-type specific instructions based on preprocessing options
172
+ if doc_type == "handwritten" and not modified_custom_prompt:
173
+ modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
174
+ elif doc_type == "handwritten" and "handwritten" not in modified_custom_prompt.lower():
175
+ modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
176
+ elif doc_type == "newspaper" and not modified_custom_prompt:
177
+ modified_custom_prompt = "This is a newspaper or document with columns. Please extract all text content from each column, maintaining proper reading order."
178
+ elif doc_type == "newspaper" and "column" not in modified_custom_prompt.lower() and "newspaper" not in modified_custom_prompt.lower():
179
+ modified_custom_prompt += " This appears to be a newspaper or document with columns. Please extract all text content from each column."
180
+ elif doc_type == "book" and not modified_custom_prompt:
181
+ modified_custom_prompt = "This is a book page. Extract titles, headers, footnotes, and body text, preserving paragraph structure and formatting."
182
+
183
+ # Update the cache key with the modified prompt
184
+ if modified_custom_prompt != custom_prompt:
185
+ cache_key = generate_cache_key(
186
+ open(temp_path, 'rb').read(),
187
+ file_type,
188
+ use_vision,
189
+ preprocessing_options,
190
+ pdf_rotation,
191
+ modified_custom_prompt
192
+ )
193
+
194
+ progress_reporter.update(30, "Processing PDF with enhanced OCR...")
195
+
196
  # Process with cached function if possible
197
  try:
198
+ result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key,
199
+ str(preprocessing_options), modified_custom_prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  progress_reporter.update(90, "Finalizing results...")
201
  except Exception as e:
202
+ logger.warning(f"Cached processing failed: {str(e)}. Using direct processing.")
203
+ progress_reporter.update(60, f"Processing error: {str(e)}. Using enhanced PDF processor...")
 
 
 
 
204
 
205
+ # Import the enhanced PDF processor
 
 
 
 
206
  try:
207
+ from utils.pdf_ocr import PDFOCR
208
+
209
+ # Use our specialized PDF processor
210
+ pdf_processor = PDFOCR()
211
+
212
+ # Process with the enhanced PDF processor
213
+ result = pdf_processor.process_pdf(
214
+ pdf_path=temp_path,
215
+ use_vision=use_vision,
216
+ max_pages=max_pages,
217
+ custom_prompt=modified_custom_prompt
218
+ )
219
+
220
+ logger.info("PDF successfully processed with enhanced PDF processor")
221
+ progress_reporter.update(90, "Finalizing results...")
 
222
  except ImportError:
223
+ logger.warning("Enhanced PDF processor not available. Falling back to standard processing.")
224
+ progress_reporter.update(70, "Falling back to standard PDF processing...")
225
+
226
+ # If enhanced processor is not available, fall back to direct StructuredOCR processing
227
+ processor = StructuredOCR()
228
+ result = processor.process_file(
229
+ file_path=temp_path,
230
+ file_type="pdf",
231
+ use_vision=use_vision,
232
+ custom_prompt=modified_custom_prompt,
233
+ file_size_mb=file_size_mb,
234
+ max_pages=max_pages
235
+ )
236
+ progress_reporter.update(90, "Finalizing results...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  else:
238
  # For image files
239
  progress_reporter.update(20, "Preparing image for processing...")
 
394
 
395
  # Check for letterhead/marginalia document types with specialized handling
396
  try:
397
+ from utils.helpers.letterhead_handler import get_letterhead_prompt, is_likely_letterhead
398
  # Extract text density features if available
399
  features = None
400
  if 'text_density' in preprocessing_options:
 
457
 
458
  # Check for letterhead/marginalia document types with specialized handling
459
  try:
460
+ from utils.helpers.letterhead_handler import get_letterhead_prompt, is_likely_letterhead
461
  # Extract text density features if available
462
  features = None
463
  if 'text_density' in preprocessing_options:
 
507
 
508
  # Check for duplicated text patterns that indicate handwritten text issues
509
  try:
510
+ from utils.helpers.ocr_text_repair import detect_duplicate_text_issues, get_enhanced_preprocessing_options, get_handwritten_specific_prompt, clean_duplicated_text
511
 
512
  # Check OCR output for duplication issues
513
  if result and 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
requirements.txt CHANGED
@@ -9,7 +9,7 @@ pydantic>=2.5.0 # Updated for better BaseModel support
9
  Pillow>=10.0.0
10
  opencv-python-headless>=4.8.0.74
11
  pdf2image>=1.16.0
12
- # pytesseract>=0.3.10 # For local OCR fallback
13
  matplotlib>=3.7.0 # For visualization in preprocessing tests
14
 
15
  # Data handling and utilities
 
9
  Pillow>=10.0.0
10
  opencv-python-headless>=4.8.0.74
11
  pdf2image>=1.16.0
12
+ pytesseract>=0.3.10 # For local OCR fallback
13
  matplotlib>=3.7.0 # For visualization in preprocessing tests
14
 
15
  # Data handling and utilities
structured_ocr.py CHANGED
The diff for this file is too large to render. See raw diff
 
ui/ui_components.py ADDED
@@ -0,0 +1,590 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import io
4
+ import base64
5
+ import logging
6
+ import re
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ import json
10
+
11
+ # Define exports
12
+ __all__ = [
13
+ 'ProgressReporter',
14
+ 'create_sidebar_options',
15
+ 'create_file_uploader',
16
+ 'display_document_with_images',
17
+ 'display_previous_results',
18
+ 'display_about_tab',
19
+ 'display_results' # Re-export from utils.ui_utils
20
+ ]
21
+ from constants import (
22
+ DOCUMENT_TYPES,
23
+ DOCUMENT_LAYOUTS,
24
+ CUSTOM_PROMPT_TEMPLATES,
25
+ LAYOUT_PROMPT_ADDITIONS,
26
+ DEFAULT_PDF_DPI,
27
+ MIN_PDF_DPI,
28
+ MAX_PDF_DPI,
29
+ DEFAULT_MAX_PAGES,
30
+ PERFORMANCE_MODES,
31
+ PREPROCESSING_DOC_TYPES,
32
+ ROTATION_OPTIONS
33
+ )
34
+ from utils.text_utils import format_ocr_text, clean_raw_text, format_markdown_text # Import from text_utils
35
+ from utils.content_utils import (
36
+ classify_document_content,
37
+ extract_document_text,
38
+ extract_image_description
39
+ )
40
+ from utils.ui_utils import display_results
41
+ from preprocessing import preprocess_image
42
+
43
class ProgressReporter:
    """Reports OCR processing progress in the Streamlit UI.

    Wraps a Streamlit placeholder with a progress bar and a status-text
    line.  Call ``setup()`` once to create the widgets, ``update()`` as
    work advances (0-100 scale), and ``complete()`` when processing ends.
    """

    def __init__(self, placeholder):
        """Store the Streamlit placeholder; widgets are created lazily in setup()."""
        self.placeholder = placeholder
        self.progress_bar = None
        self.status_text = None

    def setup(self):
        """Create the progress bar and status text inside the placeholder.

        Returns:
            self, so callers can chain: ``ProgressReporter(st.empty()).setup()``.
        """
        with self.placeholder.container():
            self.progress_bar = st.progress(0)
            self.status_text = st.empty()
        return self

    def update(self, percent, status_text):
        """Update the progress bar and status message.

        Args:
            percent: Progress on a 0-100 scale.  Fix: the value is clamped
                to [0, 100] before rescaling to the [0.0, 1.0] range that
                ``st.progress`` expects -- out-of-range input would
                otherwise raise a StreamlitAPIException.
            status_text: Message to show beneath the bar.
        """
        if self.progress_bar is not None:
            clamped = max(0, min(100, percent))
            self.progress_bar.progress(clamped / 100)
        if self.status_text is not None:
            self.status_text.text(status_text)

    def complete(self, success=True):
        """Show a final success/failure message, then clear the widgets.

        The widgets are emptied after a short delay so the completion
        message stays briefly visible before the indicators disappear.
        """
        if success:
            if self.progress_bar is not None:
                self.progress_bar.progress(100)
            if self.status_text is not None:
                self.status_text.text("Processing complete!")
        else:
            if self.status_text is not None:
                self.status_text.text("Processing failed.")

        # Local import keeps this UI-only dependency out of module import time.
        import time
        time.sleep(0.8)  # short delay so the user can read the final status
        if self.progress_bar is not None:
            self.progress_bar.empty()
        if self.status_text is not None:
            self.status_text.empty()
83
+
84
def create_sidebar_options():
    """Build the sidebar controls and return the selected OCR options.

    Returns:
        dict with keys: ``use_vision``, ``perf_mode``, ``pdf_dpi``,
        ``max_pages``, ``pdf_rotation``, ``custom_prompt``,
        ``preprocessing_options``, ``use_segmentation``.
    """
    with st.sidebar:
        st.markdown("## OCR Settings")

        # Container groups all option widgets in the sidebar.
        with st.container():
            # Vision model is always used; the toggle was removed from the UI.
            use_vision = True

            doc_type = st.selectbox("Document Type", DOCUMENT_TYPES,
                                    help="Select the type of document you're processing for better results")

            doc_layout = st.selectbox("Document Layout", DOCUMENT_LAYOUTS,
                                      help="Select the layout of your document")

            # Rotation and segmentation are currently fixed (no UI controls).
            # Fix: the original initialized these twice; bind them once here.
            rotation = 0
            use_segmentation = False

            # Build the custom prompt from the document-type template,
            # optionally extended with layout-specific instructions.
            custom_prompt = ""
            if doc_type != DOCUMENT_TYPES[0]:  # skip for the auto-detect entry
                prompt_template = CUSTOM_PROMPT_TEMPLATES.get(doc_type, "")

                if doc_layout != DOCUMENT_LAYOUTS[0]:  # not standard layout
                    layout_addition = LAYOUT_PROMPT_ADDITIONS.get(doc_layout, "")
                    if layout_addition:
                        prompt_template += " " + layout_addition

                custom_prompt = prompt_template

            # Always let the user edit the (possibly pre-filled) prompt.
            custom_prompt = st.text_area("Custom Processing Instructions", value=custom_prompt,
                                         help="Customize the instructions for processing this document",
                                         height=80)

            # Image preprocessing options (always visible)
            st.markdown("### Image Preprocessing")

            grayscale = st.checkbox("Convert to Grayscale",
                                    value=True,
                                    help="Convert color images to grayscale for better text recognition")

            denoise = st.checkbox("Light Denoising",
                                  value=True,
                                  help="Apply gentle denoising to improve text clarity")

            contrast = st.slider("Contrast Adjustment",
                                 min_value=-20,
                                 max_value=20,
                                 value=5,
                                 step=5,
                                 help="Adjust image contrast (limited range)")

            # Map UI document types onto the preprocessing pipeline's types.
            doc_type_for_preprocessing = "standard"
            if "Handwritten" in doc_type:
                doc_type_for_preprocessing = "handwritten"
            elif "Newspaper" in doc_type or "Magazine" in doc_type:
                doc_type_for_preprocessing = "newspaper"
            elif "Book" in doc_type or "Publication" in doc_type:
                doc_type_for_preprocessing = "book"  # match the actual preprocessing type

            preprocessing_options = {
                "document_type": doc_type_for_preprocessing,
                "grayscale": grayscale,
                "denoise": denoise,
                "contrast": contrast,
                "rotation": rotation
            }

            # PDF-specific options
            st.markdown("### PDF Options")
            max_pages = st.number_input("Maximum Pages to Process",
                                        min_value=1,
                                        max_value=20,
                                        value=DEFAULT_MAX_PAGES,
                                        help="Limit the number of pages to process (for multi-page PDFs)")

            # DPI and rotation controls were removed from the UI; keep fixed defaults.
            pdf_dpi = DEFAULT_PDF_DPI
            pdf_rotation = 0

            options = {
                "use_vision": use_vision,
                "perf_mode": "Quality",  # performance-mode selector removed from UI
                "pdf_dpi": pdf_dpi,
                "max_pages": max_pages,
                "pdf_rotation": pdf_rotation,
                "custom_prompt": custom_prompt,
                "preprocessing_options": preprocessing_options,
                # Fix: use_segmentation is always bound above, so the previous
                # "'use_segmentation' in locals()" guard was dead code.
                "use_segmentation": use_segmentation
            }

            return options
198
+
199
def create_file_uploader():
    """Render the app header, project framing, and the document uploader.

    Returns:
        The Streamlit UploadedFile for the chosen document, or None when
        nothing has been uploaded yet.
    """
    # Header banner: scroll icon plus the app title, as inline HTML.
    st.markdown(
        '<div style="display: flex; align-items: center; gap: 10px;">'
        '<div style="font-size: 32px;">📜</div>'
        '<div><h2 style="margin: 0; padding: 10px 0 0 0;">Historical OCR</h2></div>'
        '</div>',
        unsafe_allow_html=True,
    )
    st.markdown(
        "<p style='font-size: 0.8em; color: #666; text-align: left;'>Made possible by Mistral AI</p>",
        unsafe_allow_html=True,
    )

    # Short framing blurb explaining what the tool is for.
    st.markdown("""
    This tool assists scholars in historical research by extracting text from challenging documents. While it may not achieve 100% accuracy, it helps navigate:
    - **Historical newspapers** with complex layouts
    - **Handwritten documents** from various periods
    - **Photos of archival materials**

    Upload a document to begin, or explore the examples.
    """)

    # The uploader itself; accepted formats mirror what the OCR backend supports.
    return st.file_uploader(
        "Select file",
        type=["pdf", "png", "jpg"],
        help="Upload a PDF or image file for OCR processing",
    )
222
+
223
def display_document_with_images(result):
    """Render each page of an OCR result as an image, plus any alt text.

    Args:
        result: OCR result dict. Prefers 'pages_data'; otherwise rebuilds
            page entries from result['raw_response_data']['pages']. If
            neither source is present, shows an info message and returns.
    """
    # Check for pages_data first
    if 'pages_data' in result and result['pages_data']:
        pages_data = result['pages_data']
    # If pages_data not available, try to extract from raw_response_data
    elif 'raw_response_data' in result and isinstance(result['raw_response_data'], dict) and 'pages' in result['raw_response_data']:
        # Build pages_data from raw_response_data
        pages_data = []
        raw_pages = result['raw_response_data']['pages']

        for page_idx, page in enumerate(raw_pages):
            if not isinstance(page, dict):
                continue

            page_data = {
                'page_number': page_idx + 1,
                'markdown': page.get('markdown', ''),
                'images': []
            }

            # Extract images if present; the API may use either base64 key
            if 'images' in page and isinstance(page['images'], list):
                for img_idx, img in enumerate(page['images']):
                    if isinstance(img, dict) and ('base64' in img or 'image_base64' in img):
                        img_base64 = img.get('image_base64', img.get('base64', ''))
                        if img_base64:
                            page_data['images'].append({
                                'id': img.get('id', f"img_{page_idx}_{img_idx}"),
                                'image_base64': img_base64
                            })

            # Keep only pages that contribute text or images
            if page_data['markdown'] or page_data['images']:
                pages_data.append(page_data)
    else:
        st.info("No image data available.")
        return

    # Display each page
    for i, page_data in enumerate(pages_data):
        st.markdown(f"### Page {i+1}")

        # Display only the image - check multiple possible field names
        image_displayed = False

        # Try 'image_data' field first
        if 'image_data' in page_data:
            try:
                # Convert base64 to image
                image_data = base64.b64decode(page_data['image_data'])
                st.image(io.BytesIO(image_data), use_container_width=True)
                image_displayed = True
            except Exception as e:
                st.error(f"Error displaying image from image_data: {str(e)}")

        # Try 'images' array if image_data didn't work
        if not image_displayed and 'images' in page_data and page_data['images']:
            for img in page_data['images']:
                if 'image_base64' in img:
                    try:
                        st.image(img['image_base64'], use_container_width=True)
                        image_displayed = True
                        break
                    except Exception as e:
                        st.error(f"Error displaying image from images array: {str(e)}")

        # Try alternative image source (raw response) if still not displayed
        if not image_displayed and 'raw_response_data' in result:
            raw_data = result['raw_response_data']
            if isinstance(raw_data, dict) and 'pages' in raw_data:
                for raw_page in raw_data['pages']:
                    if isinstance(raw_page, dict) and 'images' in raw_page:
                        for img in raw_page['images']:
                            if isinstance(img, dict) and 'base64' in img:
                                st.image(img['base64'], use_container_width=True)
                                st.caption("Image from OCR response")
                                image_displayed = True
                                break
                        if image_displayed:
                            break

        if not image_displayed:
            st.info("No image available for this page.")

        # Extract and display alt text if available
        page_text = ""
        if 'text' in page_data:
            page_text = page_data['text']
        elif 'markdown' in page_data:
            page_text = page_data['markdown']

        # If the page content is a bare markdown image tag "![alt](url)",
        # surface its alt text as a caption.
        if page_text and page_text.startswith("![") and page_text.endswith(")"):
            try:
                alt_text = page_text[2:page_text.index(']')]
                if alt_text and len(alt_text) > 5:  # Only show if alt text is meaningful
                    st.caption(f"Image description: {alt_text}")
            except ValueError:
                # Malformed image syntax (no closing ']'); was a bare except
                # that silently hid every error — now only this case is ignored.
                pass
322
+
323
def display_previous_results():
    """Display previous results tab content in a simplified, structured view.

    Renders a card grid of past OCR runs from
    st.session_state.previous_results, offers a zip download of all results,
    and shows a tabbed detail view for the currently selected result.
    """

    # Use a simple header without the button column
    st.header("Previous Results")

    # Display previous results if available
    if not st.session_state.previous_results:
        # Empty state. NOTE: the <h3> tag was previously malformed
        # ('<h3="margin-bottom...">'), dropping its style attribute.
        st.markdown("""
        <div style="text-align: center; padding: 30px 20px; background-color: #f8f9fa; border-radius: 6px; margin-top: 10px;">
            <div style="font-size: 36px; margin-bottom: 15px;">📄</div>
            <h3 style="margin-bottom: 16px; font-weight: 500;">No Previous Results</h3>
            <p style="font-size: 14px; color: #666;">Process a document to see your results history.</p>
        </div>
        """, unsafe_allow_html=True)
    else:
        # Prepare zip download outside of the UI flow
        try:
            # Create download link for all results
            from utils.image_utils import create_results_zip_in_memory
            zip_data = create_results_zip_in_memory(st.session_state.previous_results)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            # Simplified filename
            zip_filename = f"ocr_results_{timestamp}.zip"

            # Encode the zip data for direct download link
            zip_b64 = base64.b64encode(zip_data).decode()

            # Add styled download tag in the metadata section
            download_html = '<div style="display: flex; align-items: center; margin: 0.5rem 0; flex-wrap: wrap;">'
            download_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Download:</div>'
            download_html += f'<a href="data:application/zip;base64,{zip_b64}" download="{zip_filename}" class="subject-tag tag-download">All Results</a>'
            download_html += '</div>'
            st.markdown(download_html, unsafe_allow_html=True)
        except Exception:
            # Silent fail - no error message to keep UI clean
            pass

        # Create a cleaner, more minimal grid for results using Streamlit columns
        num_columns = 2  # Two columns for most screens

        # Create rows of result cards
        for i in range(0, len(st.session_state.previous_results), num_columns):
            # Create a row of columns
            cols = st.columns(num_columns)

            # Fill each column with a result card
            for j in range(num_columns):
                index = i + j
                if index < len(st.session_state.previous_results):
                    result = st.session_state.previous_results[index]

                    # Get basic info for the card
                    file_name = result.get("file_name", f"Document {index+1}")
                    timestamp = result.get("timestamp", "")

                    # Determine file type icon
                    if file_name.lower().endswith(".pdf"):
                        icon = "📄"
                    elif any(file_name.lower().endswith(ext) for ext in [".jpg", ".jpeg", ".png", ".gif"]):
                        icon = "🖼️"
                    else:
                        icon = "📝"

                    # Display a simplified card in each column
                    with cols[j]:
                        # Use a container for better styling control
                        with st.container():
                            # Create visually cleaner card with less vertical space
                            st.markdown(f"""
                            <div style="padding: 10px; border: 1px solid #e0e0e0; border-radius: 6px; margin-bottom: 10px;">
                                <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 5px;">
                                    <div style="font-weight: 500; font-size: 14px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap;">{icon} {file_name}</div>
                                    <div style="color: #666; font-size: 12px;">{timestamp.split()[0] if timestamp else ""}</div>
                                </div>
                            </div>
                            """, unsafe_allow_html=True)

                            # Add a simple button below each card
                            # (plain label: no placeholders, so no f-string)
                            if st.button("View", key=f"view_{index}", help=f"View {file_name}"):
                                st.session_state.selected_previous_result = st.session_state.previous_results[index]
                                st.rerun()

    # Display the selected result if available
    if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
        selected_result = st.session_state.selected_previous_result

        # Draw a separator between results list and selected document
        st.markdown("<hr style='margin: 20px 0 15px 0; border: none; height: 1px; background-color: #eee;'>", unsafe_allow_html=True)

        # Create a cleaner header for the selected document
        file_name = selected_result.get('file_name', 'Document')
        st.subheader(f"{file_name}")

        # Add a simple back button at the top
        if st.button("← Back to Results", key="back_to_results"):
            if 'selected_previous_result' in st.session_state:
                del st.session_state.selected_previous_result
            st.session_state.perform_reset = True
            st.rerun()

        # Simplified metadata display - just one line with essential info
        meta_html = '<div style="display: flex; flex-wrap: wrap; gap: 12px; margin: 8px 0 15px 0; font-size: 14px; color: #666;">'

        # Add timestamp
        if 'timestamp' in selected_result:
            meta_html += f'<div>{selected_result["timestamp"]}</div>'

        # Add languages if available (simplified)
        if 'languages' in selected_result and selected_result['languages']:
            languages = [lang for lang in selected_result['languages'] if lang is not None]
            if languages:
                meta_html += f'<div>Language: {", ".join(languages)}</div>'

        # Add page count if available (simplified)
        if 'limited_pages' in selected_result:
            meta_html += f'<div>Pages: {selected_result["limited_pages"]["processed"]}/{selected_result["limited_pages"]["total"]}</div>'

        meta_html += '</div>'
        st.markdown(meta_html, unsafe_allow_html=True)

        # Simplified tabs - using the same format as main view
        has_images = selected_result.get('has_images', False)
        if has_images:
            view_tabs = st.tabs(["Document Content", "Raw JSON", "Images"])
            view_tab1, view_tab2, view_tab3 = view_tabs
        else:
            view_tabs = st.tabs(["Document Content", "Raw JSON"])
            view_tab1, view_tab2 = view_tabs
            view_tab3 = None

        # First tab - Document Content (simplified structured view)
        with view_tab1:
            # Display content in a cleaner, more streamlined format
            if 'ocr_contents' in selected_result and isinstance(selected_result['ocr_contents'], dict):
                # Create a more focused list of important sections
                priority_sections = ["title", "content", "transcript", "summary"]
                displayed_sections = set()

                # First display priority sections
                for section in priority_sections:
                    if section in selected_result['ocr_contents'] and selected_result['ocr_contents'][section]:
                        content = selected_result['ocr_contents'][section]
                        if isinstance(content, str) and content.strip():
                            # Only add a subheader for meaningful section names, not raw_text
                            if section != "raw_text":
                                st.markdown(f"##### {section.replace('_', ' ').title()}")

                            # Format and display content
                            formatted_content = format_ocr_text(content, for_display=True)
                            st.markdown(formatted_content)
                            displayed_sections.add(section)

                # Then display any remaining sections not already shown
                for section, content in selected_result['ocr_contents'].items():
                    if (section not in displayed_sections and
                        section not in ['error', 'partial_text'] and
                        content):
                        st.markdown(f"##### {section.replace('_', ' ').title()}")

                        if isinstance(content, str):
                            st.markdown(format_ocr_text(content, for_display=True))
                        elif isinstance(content, list):
                            for item in content:
                                st.markdown(f"- {item}")
                        elif isinstance(content, dict):
                            for k, v in content.items():
                                st.markdown(f"**{k}:** {v}")

        # Second tab - Raw JSON (simplified)
        with view_tab2:
            # Extract the relevant JSON data
            json_data = {}

            # Include important metadata.
            # FIX: the list previously contained "'text',' raw_text'" — the
            # stray space/quoting made ' raw_text' an unmatchable key.
            for field in ['file_name', 'timestamp', 'processing_time', 'title', 'languages',
                          'topics', 'subjects', 'text', 'raw_text']:
                if field in selected_result:
                    json_data[field] = selected_result[field]

            # Include OCR contents
            if 'ocr_contents' in selected_result:
                json_data['ocr_contents'] = selected_result['ocr_contents']

            # Format the JSON prettily
            json_str = json.dumps(json_data, indent=2)

            # Display in a monospace font with syntax highlighting
            st.code(json_str, language="json")

        # Third tab - Images (simplified)
        if has_images and view_tab3 is not None:
            with view_tab3:
                # Simplified image display
                if 'pages_data' in selected_result:
                    for i, page_data in enumerate(selected_result['pages_data']):
                        # Display each page's images
                        if 'images' in page_data and len(page_data['images']) > 0:
                            for img in page_data['images']:
                                if 'image_base64' in img:
                                    st.image(img['image_base64'], use_container_width=True)

                        # Get page text if available
                        page_text = ""
                        if 'markdown' in page_data:
                            page_text = page_data['markdown']

                        # Display text if available
                        if page_text:
                            with st.expander(f"Page {i+1} Text", expanded=False):
                                st.text(page_text)
535
+
536
def display_about_tab():
    """Display learn more tab content.

    Static informational page: app description, purpose, feature list,
    usage steps, underlying technologies, and version. All content is
    emitted as markdown; nothing here reads or writes session state.
    """
    st.header("Learn More")

    # Add app description
    st.markdown("""
    **Historical OCR** is a tailored academic tool for extracting text from historical documents, manuscripts, and printed materials.
    """)

    # Purpose section with consistent formatting
    st.markdown("### Purpose")
    st.markdown("""
    This tool is designed to assist scholars in historical research by extracting text from challenging documents.
    While it may not achieve full accuracy for all materials, it serves as a tailored research aid for navigating
    historical documents, particularly:
    """)

    st.markdown("""
    - **Historical newspapers** with complex layouts and aged text
    - **Handwritten documents** from various time periods
    - **Photos of archival materials** that may be difficult to read
    """)

    # Features section with consistent formatting
    st.markdown("### Features")
    st.markdown("""
    - **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
    - **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
    - **Editable Results**: Review and edit extracted text directly in the interface
    - **Structured Content Analysis**: Automatic organization of document content
    - **Multi-language Support**: Process documents in various languages
    - **PDF Processing**: Handle multi-page historical documents
    """)

    # How to Use section with consistent formatting
    st.markdown("### How to Use")
    st.markdown("""
    1. Upload a document (PDF or image)
    2. Select the document type and adjust preprocessing options if needed
    3. Add custom processing instructions for specialized documents
    4. Process the document
    5. Review, edit, and download the results
    """)

    # Technologies section with consistent formatting
    st.markdown("### Technologies")
    st.markdown("""
    - OCR processing using Mistral AI's advanced document understanding capabilities
    - Image preprocessing with OpenCV
    - PDF handling with pdf2image
    - Web interface with Streamlit
    """)

    # Add version information
    st.markdown("**Version:** 2.0.0")
utils/helpers/language_detection.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Standard library imports
2
+ import logging
3
+ import re
4
+ from typing import List, Dict, Set, Tuple, Optional, Union, Any
5
+ from functools import lru_cache
6
+
7
+ # Configure logging
8
+ logging.basicConfig(level=logging.INFO,
9
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
10
+ logger = logging.getLogger(__name__)
11
+
12
+ class LanguageDetector:
13
+ """
14
+ A language detection system that provides balanced detection across multiple languages
15
+ using an enhanced statistical approach.
16
+ """
17
+
18
+ def __init__(self):
19
+ """Initialize the language detector with statistical language models"""
20
+ logger.info("Initializing language detector with statistical models")
21
+
22
+ # Initialize language indicators dictionary for statistical detection
23
+ self._init_language_indicators()
24
+ # Set thresholds for language detection confidence
25
+ self.single_lang_confidence = 65 # Minimum score to consider a language detected
26
+ self.secondary_lang_threshold = 0.75 # Secondary language must be at least this fraction of primary score
27
+
28
    def _init_language_indicators(self):
        """Initialize language indicators for statistical detection with historical markers.

        Populates self.language_indicators, mapping a language name to:
        - "chars": distinctive characters for that language
        - "words": common words (including historical forms)
        - "ngrams": frequent character sequences
        - "historical" (optional): chars/words/regex patterns marking older
          forms of the language, scored separately by _detect_statistically
        """
        # Define indicators for all supported languages with equal detail level
        self.language_indicators = {
            "English": {
                "chars": [], # English uses basic Latin alphabet without special chars
                "words": ['the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for', 'it',
                          'with', 'as', 'be', 'on', 'by', 'at', 'this', 'have', 'from', 'or',
                          'an', 'but', 'not', 'what', 'all', 'were', 'when', 'we', 'there', 'can',
                          'would', 'who', 'you', 'been', 'one', 'their', 'has', 'more', 'if', 'no'],
                "ngrams": ['th', 'he', 'in', 'er', 'an', 're', 'on', 'at', 'en', 'nd', 'ti', 'es', 'or',
                           'ing', 'tion', 'the', 'and', 'tha', 'ent', 'ion'],
                "historical": {
                    "chars": ['þ', 'ȝ', 'æ', 'ſ'], # Thorn, yogh, ash, long s
                    "words": ['thou', 'thee', 'thy', 'thine', 'hath', 'doth', 'ere', 'whilom', 'betwixt',
                              'ye', 'art', 'wast', 'dost', 'hast', 'shalt', 'mayst', 'verily'],
                    "patterns": ['eth$', '^y[^a-z]', 'ck$', 'aught', 'ought'] # -eth endings, y- prefixes
                }
            },
            "French": {
                "chars": ['é', 'è', 'ê', 'à', 'ç', 'ù', 'â', 'î', 'ô', 'û', 'ë', 'ï', 'ü'],
                "words": ['le', 'la', 'les', 'et', 'en', 'de', 'du', 'des', 'un', 'une', 'ce', 'cette',
                          'ces', 'dans', 'par', 'pour', 'sur', 'qui', 'que', 'quoi', 'où', 'quand', 'comment',
                          'est', 'sont', 'ont', 'nous', 'vous', 'ils', 'elles', 'avec', 'sans', 'mais', 'ou'],
                "ngrams": ['es', 'le', 'de', 'en', 'on', 'nt', 'qu', 'ai', 'an', 'ou', 'ur', 're', 'me',
                           'les', 'ent', 'que', 'des', 'ons', 'ant', 'ion'],
                "historical": {
                    "chars": ['ſ', 'æ', 'œ'], # Long s and ligatures
                    # Includes deliberate OCR-confusion forms (long-s read as f):
                    # e.g. 'felon' (selon), 'chofe' (chose), 'fcience' (science)
                    "words": ['aultre', 'avecq', 'icelluy', 'oncques', 'moult', 'estre', 'mesme', 'ceste',
                              'ledict', 'celuy', 'ceulx', 'aulcun', 'ainſi', 'touſiours', 'eſtre',
                              'eſt', 'meſme', 'felon', 'auec', 'iufques', 'chofe', 'fcience'],
                    "patterns": ['oi[ts]$', 'oi[re]$', 'f[^aeiou]', 'ff', 'ſ', 'auoit', 'eſtoit',
                                 'ſi', 'ſur', 'ſa', 'cy', 'ayant', 'oy', 'uſ', 'auſ']
                },
            },
            "German": {
                "chars": ['ä', 'ö', 'ü', 'ß'],
                "words": ['der', 'die', 'das', 'und', 'in', 'zu', 'den', 'ein', 'eine', 'mit', 'ist', 'von',
                          'des', 'sich', 'auf', 'für', 'als', 'auch', 'werden', 'bei', 'durch', 'aus', 'sind',
                          'nicht', 'nur', 'wurde', 'wie', 'wenn', 'aber', 'noch', 'nach', 'so', 'sein', 'über'],
                "ngrams": ['en', 'er', 'ch', 'de', 'ei', 'in', 'te', 'nd', 'ie', 'ge', 'un', 'sch', 'ich',
                           'den', 'die', 'und', 'der', 'ein', 'ung', 'cht'],
                "historical": {
                    "chars": ['ſ', 'ů', 'ė', 'ÿ'],
                    # NOTE(review): 'vnnd' appears twice — harmless for membership
                    # tests, but likely unintended
                    "words": ['vnnd', 'vnnd', 'vnter', 'vnd', 'seyn', 'thun', 'auff', 'auß', 'deß', 'diß'],
                    "patterns": ['^v[nd]', 'th', 'vnter', 'ſch']
                }
            },
            "Spanish": {
                "chars": ['á', 'é', 'í', 'ó', 'ú', 'ñ', 'ü', '¿', '¡'],
                "words": ['el', 'la', 'los', 'las', 'de', 'en', 'y', 'a', 'que', 'por', 'un', 'una', 'no',
                          'es', 'con', 'para', 'su', 'al', 'se', 'del', 'como', 'más', 'pero', 'lo', 'mi',
                          'si', 'ya', 'todo', 'esta', 'cuando', 'hay', 'muy', 'bien', 'sin', 'así'],
                "ngrams": ['de', 'en', 'os', 'es', 'la', 'ar', 'el', 'er', 'ra', 'as', 'an', 'do', 'or',
                           'que', 'nte', 'los', 'ado', 'con', 'ent', 'ien'],
                "historical": {
                    "chars": ['ſ', 'ç', 'ñ'],
                    "words": ['facer', 'fijo', 'fermoso', 'agora', 'asaz', 'aver', 'caſa', 'deſde', 'eſte',
                              'eſta', 'eſto', 'deſto', 'deſta', 'eſſo', 'muger', 'dixo', 'fazer'],
                    "patterns": ['^f[aei]', 'ſſ', 'ſc', '^deſ', 'xo$', 'xe$']
                },
            },
            "Italian": {
                "chars": ['à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú'],
                "words": ['il', 'la', 'i', 'le', 'e', 'di', 'a', 'in', 'che', 'non', 'per', 'con', 'un',
                          'una', 'del', 'della', 'è', 'sono', 'da', 'si', 'come', 'anche', 'più', 'ma', 'ci',
                          'se', 'ha', 'mi', 'lo', 'ti', 'al', 'tu', 'questo', 'questi'],
                "ngrams": ['di', 'la', 'er', 'to', 're', 'co', 'de', 'in', 'ra', 'on', 'li', 'no', 'ri',
                           'che', 'ent', 'con', 'per', 'ion', 'ato', 'lla']
            },
            "Portuguese": {
                "chars": ['á', 'â', 'ã', 'à', 'é', 'ê', 'í', 'ó', 'ô', 'õ', 'ú', 'ç'],
                "words": ['o', 'a', 'os', 'as', 'de', 'em', 'e', 'do', 'da', 'dos', 'das', 'no', 'na',
                          'para', 'que', 'um', 'uma', 'por', 'com', 'se', 'não', 'mais', 'como', 'mas',
                          'você', 'eu', 'este', 'isso', 'ele', 'seu', 'sua', 'ou', 'já', 'me'],
                "ngrams": ['de', 'os', 'em', 'ar', 'es', 'ra', 'do', 'da', 'en', 'co', 'nt', 'ad', 'to',
                           'que', 'nto', 'ent', 'com', 'ção', 'ado', 'ment']
            },
            "Dutch": {
                "chars": ['ë', 'ï', 'ö', 'ü', 'é', 'è', 'ê', 'ç', 'á', 'à', 'ä', 'ó', 'ô', 'ú', 'ù', 'û', 'ij'],
                "words": ['de', 'het', 'een', 'en', 'van', 'in', 'is', 'dat', 'op', 'te', 'zijn', 'met',
                          'voor', 'niet', 'aan', 'er', 'die', 'maar', 'dan', 'ik', 'je', 'hij', 'zij', 'we',
                          'kunnen', 'wordt', 'nog', 'door', 'over', 'als', 'uit', 'bij', 'om', 'ook'],
                "ngrams": ['en', 'de', 'er', 'ee', 'ge', 'an', 'aa', 'in', 'te', 'et', 'ng', 'ee', 'or',
                           'van', 'het', 'een', 'ing', 'ver', 'den', 'sch']
            },
            "Russian": {
                # Russian (Cyrillic alphabet) characters
                "chars": ['а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п',
                          'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'],
                "words": ['и', 'в', 'не', 'на', 'что', 'я', 'с', 'а', 'то', 'он', 'как', 'этот', 'по',
                          'но', 'из', 'к', 'у', 'за', 'вы', 'все', 'так', 'же', 'от', 'для', 'о', 'его',
                          'мы', 'было', 'она', 'бы', 'мне', 'еще', 'есть', 'быть', 'был'],
                "ngrams": ['о', 'е', 'а', 'н', 'и', 'т', 'р', 'с', 'в', 'л', 'к', 'м', 'д',
                           'ст', 'но', 'то', 'ни', 'на', 'по', 'ет']
            },
            "Chinese": {
                "chars": ['的', '是', '不', '了', '在', '和', '有', '我', '们', '人', '这', '上', '中',
                          '个', '大', '来', '到', '国', '时', '要', '地', '出', '会', '可', '也', '就',
                          '年', '生', '对', '能', '自', '那', '都', '得', '说', '过', '子', '家', '后', '多'],
                # Chinese doesn't have "words" in the same way as alphabetic languages
                "words": ['的', '是', '不', '了', '在', '和', '有', '我', '们', '人', '这', '上', '中',
                          '个', '大', '来', '到', '国', '时', '要', '地', '出', '会', '可', '也', '就'],
                "ngrams": ['的', '是', '不', '了', '在', '我', '有', '和', '人', '这', '中', '大', '来', '上',
                           '国', '个', '到', '说', '们', '为']
            },
            "Japanese": {
                # A mix of hiragana, katakana, and common kanji
                "chars": ['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ',
                          'ア', 'イ', 'ウ', 'エ', 'オ', 'カ', 'キ', 'ク', 'ケ', 'コ', 'サ', 'シ', 'ス', 'セ', 'ソ',
                          '日', '本', '人', '大', '小', '中', '山', '川', '田', '子', '女', '男', '月', '火', '水'],
                "words": ['は', 'を', 'に', 'の', 'が', 'で', 'へ', 'から', 'より', 'まで', 'だ', 'です', 'した',
                          'ます', 'ません', 'です', 'これ', 'それ', 'あれ', 'この', 'その', 'あの', 'わたし'],
                "ngrams": ['の', 'は', 'た', 'が', 'を', 'に', 'て', 'で', 'と', 'し', 'か', 'ま', 'こ', 'い',
                           'する', 'いる', 'れる', 'なる', 'れて', 'した']
            },
            "Korean": {
                "chars": ['가', '나', '다', '라', '마', '바', '사', '아', '자', '차', '카', '타', '파', '하',
                          '그', '는', '을', '이', '에', '에서', '로', '으로', '와', '과', '또는', '하지만'],
                "words": ['이', '그', '저', '나', '너', '우리', '그들', '이것', '그것', '저것', '은', '는',
                          '이', '가', '을', '를', '에', '에서', '으로', '로', '와', '과', '의', '하다', '되다'],
                "ngrams": ['이', '다', '는', '에', '하', '고', '지', '서', '의', '가', '을', '로', '을', '으',
                           '니다', '습니', '하는', '이다', '에서', '하고']
            },
            "Arabic": {
                "chars": ['ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض',
                          'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي', 'ء', 'ة', 'ى'],
                "words": ['في', 'من', 'على', 'إلى', 'هذا', 'هذه', 'ذلك', 'تلك', 'هو', 'هي', 'هم', 'أنا',
                          'أنت', 'نحن', 'كان', 'كانت', 'يكون', 'لا', 'لم', 'ما', 'أن', 'و', 'أو', 'ثم', 'بعد'],
                "ngrams": ['ال', 'ان', 'في', 'من', 'ون', 'ين', 'ات', 'ار', 'ور', 'ما', 'لا', 'ها', 'ان',
                           'الم', 'لان', 'علا', 'الح', 'الس', 'الع', 'الت']
            },
            "Hindi": {
                "chars": ['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ',
                          'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न',
                          'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी',
                          'ु', 'ू', 'े', 'ै', 'ो', 'ौ', '्', 'ं', 'ः'],
                "words": ['और', 'का', 'के', 'की', 'एक', 'में', 'है', 'यह', 'हैं', 'से', 'को', 'पर', 'इस',
                          'हो', 'गया', 'कर', 'मैं', 'या', 'हुआ', 'था', 'वह', 'अपने', 'सकता', 'ने', 'बहुत'],
                "ngrams": ['का', 'के', 'की', 'है', 'ने', 'से', 'मे', 'को', 'पर', 'हा', 'रा', 'ता', 'या',
                           'ार', 'ान', 'कार', 'राज', 'ारा', 'जाए', 'ेजा']
            },
            "Latin": {
                "chars": [], # Latin uses basic Latin alphabet
                "words": ['et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed', 'qui', 'quod', 'ut', 'si',
                          'nec', 'ex', 'per', 'quam', 'pro', 'iam', 'hoc', 'aut', 'esse', 'enim', 'de',
                          'atque', 'ac', 'ante', 'post', 'sub', 'ab'],
                "ngrams": ['us', 'is', 'um', 'er', 'it', 'nt', 'am', 'em', 're', 'at', 'ti', 'es', 'ur',
                           'tur', 'que', 'ere', 'ent', 'ius', 'rum', 'tus']
            },
            "Greek": {
                "chars": ['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π',
                          'ρ', 'σ', 'ς', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω', 'ά', 'έ', 'ή', 'ί', 'ό', 'ύ', 'ώ'],
                "words": ['και', 'του', 'της', 'των', 'στο', 'στη', 'με', 'από', 'για', 'είναι', 'να',
                          'ότι', 'δεν', 'στον', 'μια', 'που', 'ένα', 'έχει', 'θα', 'το', 'ο', 'η', 'τον'],
                "ngrams": ['αι', 'τα', 'ου', 'τη', 'οι', 'το', 'ης', 'αν', 'ος', 'ον', 'ις', 'ει', 'ερ',
                           'και', 'την', 'τον', 'ους', 'νου', 'εντ', 'μεν']
            }
        }
192
+
193
+ def detect_languages(self, text: str, filename: str = None, current_languages: List[str] = None) -> List[str]:
194
+ """
195
+ Detect languages in text using an enhanced statistical approach
196
+
197
+ Args:
198
+ text: Text to analyze
199
+ filename: Optional filename to provide additional context
200
+ current_languages: Optional list of languages already detected
201
+
202
+ Returns:
203
+ List of detected languages
204
+ """
205
+ logger = logging.getLogger("language_detector")
206
+
207
+ # If no text provided, return current languages or default
208
+ if not text or len(text.strip()) < 10:
209
+ return current_languages if current_languages else ["English"]
210
+
211
+ # If we already have detected languages, use them
212
+ if current_languages and len(current_languages) > 0:
213
+ logger.info(f"Using already detected languages: {current_languages}")
214
+ return current_languages
215
+
216
+ # Use enhanced statistical detection
217
+ detected_languages = self._detect_statistically(text, filename)
218
+ logger.info(f"Statistical language detection results: {detected_languages}")
219
+ return detected_languages
220
+
221
+ def _detect_statistically(self, text: str, filename: str = None) -> List[str]:
222
+ """
223
+ Detect languages using enhanced statistical analysis with historical language indicators
224
+
225
+ Args:
226
+ text: Text to analyze
227
+ filename: Optional filename for additional context
228
+
229
+ Returns:
230
+ List of detected languages
231
+ """
232
+ logger = logging.getLogger("language_detector")
233
+
234
+ # Normalize text to lowercase for consistent analysis
235
+ text_lower = text.lower()
236
+ words = re.findall(r'\b\w+\b', text_lower) # Extract words
237
+
238
+ # Score each language based on characters, words, n-grams, and historical markers
239
+ language_scores = {}
240
+ historical_bonus = {}
241
+
242
+ # PHASE 1: Special character analysis
243
+ # Count special characters for each language
244
+ special_char_counts = {}
245
+ total_special_chars = 0
246
+
247
+ for language, indicators in self.language_indicators.items():
248
+ chars = indicators["chars"]
249
+ count = 0
250
+ for char in chars:
251
+ if char in text_lower:
252
+ count += text_lower.count(char)
253
+ special_char_counts[language] = count
254
+ total_special_chars += count
255
+
256
+ # Normalize character scores (0-30 points)
257
+ for language, count in special_char_counts.items():
258
+ if total_special_chars > 0:
259
+ # Scale score to 0-30 range (reduced from 35 to make room for historical)
260
+ normalized_score = (count / total_special_chars) * 30
261
+ language_scores[language] = normalized_score
262
+ else:
263
+ language_scores[language] = 0
264
+
265
+ # PHASE 2: Word analysis (0-30 points)
266
+ # Count common words for each language
267
+ for language, indicators in self.language_indicators.items():
268
+ word_list = indicators["words"]
269
+ word_matches = sum(1 for word in words if word in word_list)
270
+
271
+ # Normalize word score based on text length and word list size
272
+ word_score_factor = min(1.0, word_matches / (len(words) * 0.1)) # Max 1.0 if 10% match
273
+ language_scores[language] = language_scores.get(language, 0) + (word_score_factor * 30)
274
+
275
+ # PHASE 3: N-gram analysis (0-20 points)
276
+ for language, indicators in self.language_indicators.items():
277
+ ngram_list = indicators["ngrams"]
278
+ ngram_matches = 0
279
+
280
+ # Count ngram occurrences
281
+ for ngram in ngram_list:
282
+ ngram_matches += text_lower.count(ngram)
283
+
284
+ # Normalize ngram score based on text length
285
+ if len(text_lower) > 0:
286
+ ngram_score_factor = min(1.0, ngram_matches / (len(text_lower) * 0.05)) # Max 1.0 if 5% match
287
+ language_scores[language] = language_scores.get(language, 0) + (ngram_score_factor * 20)
288
+
289
+ # PHASE 4: Historical language markers (0-20 points)
290
+ for language, indicators in self.language_indicators.items():
291
+ if "historical" in indicators:
292
+ historical_indicators = indicators["historical"]
293
+ historical_score = 0
294
+
295
+ # Check for historical chars
296
+ if "chars" in historical_indicators:
297
+ for char in historical_indicators["chars"]:
298
+ if char in text_lower:
299
+ historical_score += text_lower.count(char) * 0.5
300
+
301
+ # Check for historical words
302
+ if "words" in historical_indicators:
303
+ hist_words = historical_indicators["words"]
304
+ hist_word_matches = sum(1 for word in words if word in hist_words)
305
+ if hist_word_matches > 0:
306
+ # Historical words are strong indicators
307
+ historical_score += min(10, hist_word_matches * 2)
308
+
309
+ # Check for historical patterns
310
+ if "patterns" in historical_indicators:
311
+ for pattern in historical_indicators["patterns"]:
312
+ matches = len(re.findall(pattern, text_lower))
313
+ if matches > 0:
314
+ historical_score += min(5, matches * 0.5)
315
+
316
+ # Cap historical score at 20 points
317
+ historical_score = min(20, historical_score)
318
+ historical_bonus[language] = historical_score
319
+
320
+ # Apply historical bonus
321
+ language_scores[language] += historical_score
322
+
323
+ # Apply language-specific exclusivity multiplier if present
324
+ if "exclusivity" in indicators:
325
+ exclusivity = indicators["exclusivity"]
326
+ language_scores[language] *= exclusivity
327
+ logger.info(f"Applied exclusivity multiplier {exclusivity} to {language}")
328
+
329
+ # Print historical bonus for debugging
330
+ for language, bonus in historical_bonus.items():
331
+ if bonus > 0:
332
+ logger.info(f"Historical language bonus for {language}: {bonus} points")
333
+
334
+ # Final language selection with more stringent criteria
335
+ # Get languages with scores above threshold
336
+ threshold = self.single_lang_confidence # Higher minimum score
337
+ candidates = [(lang, score) for lang, score in language_scores.items() if score >= threshold]
338
+ candidates.sort(key=lambda x: x[1], reverse=True)
339
+
340
+ logger.info(f"Language candidates: {candidates}")
341
+
342
+ # If we have candidate languages, return top 1-2 with higher threshold for secondary
343
+ if candidates:
344
+ # Always take top language
345
+ result = [candidates[0][0]]
346
+
347
+ # Add second language only if it's significantly strong compared to primary
348
+ # and doesn't have a historical/exclusivity conflict
349
+ if len(candidates) > 1:
350
+ primary_lang = candidates[0][0]
351
+ secondary_lang = candidates[1][0]
352
+ primary_score = candidates[0][1]
353
+ secondary_score = candidates[1][1]
354
+
355
+ # Only add secondary if it meets threshold and doesn't conflict
356
+ ratio = secondary_score / primary_score
357
+
358
+ # Check for French and Spanish conflict (historical French often gets misidentified)
359
+ historical_conflict = False
360
+ if (primary_lang == "French" and secondary_lang == "Spanish" and
361
+ historical_bonus.get("French", 0) > 5):
362
+ historical_conflict = True
363
+ logger.info("Historical French markers detected, suppressing Spanish detection")
364
+
365
+ if ratio >= self.secondary_lang_threshold and not historical_conflict:
366
+ result.append(secondary_lang)
367
+ logger.info(f"Added secondary language {secondary_lang} (score ratio: {ratio:.2f})")
368
+ else:
369
+ logger.info(f"Rejected secondary language {secondary_lang} (score ratio: {ratio:.2f})")
370
+
371
+ return result
372
+
373
+ # Default to English if no clear signals
utils/helpers/letterhead_handler.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Standard library imports
2
+ import os
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ # Configure logging
7
+ logging.basicConfig(level=logging.INFO,
8
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
9
+ logger = logging.getLogger(__name__)
10
+
11
def is_likely_letterhead(file_path, features=None):
    """
    Heuristically decide whether a document likely contains letterhead
    or marginalia.

    Args:
        file_path: Path to the document image
        features: Optional dictionary of pre-extracted features like text density

    Returns:
        bool: True if the document likely contains letterhead, False otherwise
    """
    name_lower = Path(file_path).name.lower()

    # Cheap filename heuristic: any of these keywords marks the document.
    if any(keyword in name_lower for keyword in ('letter', 'letterhead', 'correspondence', 'memo')):
        logger.info(f"Letterhead detected based on filename: {name_lower}")
        return True

    if features:
        # Dense text at the top of the page is a typical letterhead signature.
        if features.get('top_density', 0) > 0.5:
            logger.info(f"Letterhead detected based on top text density: {features['top_density']}")
            return True

        # Strongly uneven text distribution suggests marginalia.
        if features.get('density_variance', 0) > 0.3:
            logger.info(f"Possible marginalia detected based on text density variance")
            return True

    # No signal: treat as a standard document.
    return False
46
+
47
def get_letterhead_prompt(file_path, features=None):
    """
    Generate a specialized prompt for letterhead document OCR.

    Args:
        file_path: Path to the document image (currently unused; kept for
            interface symmetry with is_likely_letterhead)
        features: Optional dictionary of pre-extracted features

    Returns:
        str: Specialized prompt for letterhead document OCR
    """
    # Core instructions shared by every letterhead document.
    sections = [
        "This document appears to be a letter or includes letterhead elements. "
        "Please extract the following components separately if present:\n"
        "1. Letterhead (header with logo, organization name, address, etc.)\n"
        "2. Date\n"
        "3. Recipient information (address, name, title)\n"
        "4. Salutation (e.g., 'Dear Sir/Madam')\n"
        "5. Main body text\n"
        "6. Closing (e.g., 'Sincerely')\n"
        "7. Signature\n"
        "8. Any footnotes, marginalia, or annotations\n\n"
        "Preserve the original formatting and structure as much as possible."
    ]

    feats = features or {}

    # Feature-driven refinements appended after the base instructions.
    if feats.get('is_historical'):
        sections.append(
            "\n\nThis appears to be a historical document. Pay special attention to older "
            "letterhead styles, formal language patterns, and period-specific formatting."
        )
    if feats.get('has_marginalia'):
        sections.append(
            "\n\nThe document contains marginalia or handwritten notes in the margins. "
            "Please extract these separately from the main text and indicate their position."
        )

    return "".join(sections)
utils/helpers/ocr_text_repair.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Standard library imports
2
+ import re
3
+ import logging
4
+ from difflib import SequenceMatcher
5
+ from typing import Tuple, Dict, Any, List, Optional
6
+
7
+ # Configure logging
8
+ logging.basicConfig(level=logging.INFO,
9
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
10
+ logger = logging.getLogger(__name__)
11
+
12
def detect_duplicate_text_issues(text: str) -> Tuple[bool, Dict[str, Any]]:
    """
    Detect whether OCR output shows the text-duplication artifacts that
    commonly appear when OCR is run over handwritten documents.

    Three independent signals are measured — exact repeated lines,
    near-duplicate 100-character blocks, and immediately repeated words —
    and the worst of the three becomes the overall duplication rate.

    Args:
        text: OCR text to analyze

    Returns:
        Tuple of (has_duplication_issues, details_dict)
    """
    # Texts shorter than 100 chars carry too little signal to analyze.
    if not text or len(text) < 100:
        return False, {"duplication_rate": 0.0, "details": "Text too short for analysis"}

    text_lines = text.split('\n')
    total_lines = len(text_lines)

    # --- Signal 1: exact line repetitions ---------------------------------
    first_seen = {}
    repeated_lines = 0
    line_repetition_indices = []
    for idx, raw_line in enumerate(text_lines):
        candidate = raw_line.strip()
        if len(candidate) < 5:  # skip empty / trivially short lines
            continue
        earlier = first_seen.get(candidate)
        if earlier is None:
            first_seen[candidate] = idx
        else:
            repeated_lines += 1
            line_repetition_indices.append((earlier, idx))

    line_repetition_rate = repeated_lines / max(1, total_lines)

    # --- Signal 2: near-duplicate 100-char blocks -------------------------
    blocks = [text[pos:pos + 100] for pos in range(0, len(text), 100) if pos + 100 <= len(text)]
    total_blocks = len(blocks)

    repeated_blocks = 0
    duplicate_sections = []
    for left in range(total_blocks):
        # Only compare against the next few blocks to stay cheap.
        for right in range(left + 1, min(left + 10, total_blocks)):
            similarity = SequenceMatcher(None, blocks[left], blocks[right]).ratio()
            if similarity > 0.8:  # high-similarity threshold
                repeated_blocks += 1
                duplicate_sections.append((left, right, similarity))
                break

    block_repetition_rate = repeated_blocks / max(1, total_blocks)

    # --- Signal 3: immediately repeated words ("the the") -----------------
    repeated_words = len(re.findall(r'\b(\w+)\s+\1\b', text))
    repeated_words_rate = repeated_words / max(1, len(text.split()))

    # Overall rate is the strongest of the three signals.
    duplication_rate = max(line_repetition_rate, block_repetition_rate, repeated_words_rate)

    logger.info(f"OCR duplication analysis: line_repetition={line_repetition_rate:.2f}, "
                f"block_repetition={block_repetition_rate:.2f}, "
                f"word_repetition={repeated_words_rate:.2f}, "
                f"final_rate={duplication_rate:.2f}")

    return duplication_rate > 0.1, {
        "duplication_rate": duplication_rate,
        "line_repetition_rate": line_repetition_rate,
        "block_repetition_rate": block_repetition_rate,
        "word_repetition_rate": repeated_words_rate,
        "repeated_lines": repeated_lines,
        "repeated_blocks": repeated_blocks,
        "repeated_words": repeated_words,
        "duplicate_sections": duplicate_sections[:10],  # cap for brevity
        "repetition_indices": line_repetition_indices[:10],
    }
101
+
102
def get_enhanced_preprocessing_options(current_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Build preprocessing options tuned for OCR of handwritten documents.

    The caller's options (if any) are copied, then overridden with
    handwriting-oriented settings; the input dict is never mutated.

    Args:
        current_options: Current preprocessing options (if available)

    Returns:
        Dict of enhanced options
    """
    # Work on a copy so the caller's dict is left untouched.
    options: Dict[str, Any] = dict(current_options) if current_options else {}

    # Handwriting-oriented overrides: boosted contrast, grayscale,
    # adaptive thresholding with a larger block size, and denoising.
    # Plain binarization is disabled because it tends to destroy
    # stroke detail in handwriting.
    options.update({
        "document_type": "handwritten",
        "contrast": 1.4,
        "grayscale": True,
        "adaptive_threshold": True,
        "threshold_block_size": 25,
        "threshold_c": 10,
        "binarize": False,
        "denoise": True,
        "handwriting_mode": True,
    })

    # Sharpening can amplify noise around pen strokes; switch it off
    # if the caller had it configured.
    if "sharpen" in options:
        options["sharpen"] = False

    logger.info(f"Enhanced handwriting preprocessing options generated: {options}")
    return options
144
+
145
def get_handwritten_specific_prompt(current_prompt: Optional[str] = None) -> str:
    """
    Build a specialized OCR prompt for handwritten documents, merging in
    any caller-supplied instructions without duplicating handwriting advice.

    Args:
        current_prompt: Current prompt (if available)

    Returns:
        str: Enhanced prompt for handwritten documents
    """
    # Instructions applied to every handwritten document.
    base_prompt = ("This is a handwritten document that requires careful transcription. "
                   "Please transcribe all visible handwritten text, preserving the original "
                   "line breaks, paragraph structure, and any special formatting or indentation. "
                   "Pay special attention to:\n"
                   "1. Words that may be difficult to read due to handwriting style\n"
                   "2. Any crossed-out text (indicate with [crossed out: possible text])\n"
                   "3. Insertions or annotations between lines or in margins\n"
                   "4. Maintain the spatial layout of the text as much as possible\n"
                   "5. If there are multiple columns or non-linear text, preserve the reading order\n\n"
                   "If you cannot read a word with confidence, indicate with [?] or provide your best guess as [word?].")

    # No caller prompt: the base instructions stand alone.
    if not current_prompt:
        return base_prompt

    lowered = current_prompt.lower()
    if "handwritten" not in lowered and "handwriting" not in lowered:
        # Caller prompt says nothing about handwriting: append it verbatim.
        return f"{base_prompt}\n\nAdditional context from user:\n{current_prompt}"

    # Caller prompt already mentions handwriting; keep only the sentences
    # that add something beyond that, to avoid redundant instructions.
    # (Naive '.'-based sentence split — simplified, may need improvement.)
    extras = [
        sentence.strip()
        for sentence in current_prompt.split('.')
        if sentence.strip()
        and "handwritten" not in sentence.lower()
        and "handwriting" not in sentence.lower()
    ]
    if extras:
        return base_prompt + "\n\nAdditional instructions:\n" + ". ".join(extras) + "."

    return base_prompt
189
+
190
def clean_duplicated_text(text: str) -> str:
    """
    Clean up duplicated text often found in OCR output for handwritten documents.

    Three passes are applied:
      1. consecutive duplicate lines (and runs of blank lines) are collapsed,
      2. immediately repeated words ("the the") are collapsed, and
      3. immediately repeated phrases of 3-6 words are collapsed.

    Note: the phrase pass re-joins the text with single spaces, so line
    breaks surviving pass 1 are flattened in the final output.

    Args:
        text: OCR text to clean

    Returns:
        str: Cleaned text with duplications removed
    """
    # Nothing to do for empty input (also covers None).
    if not text:
        return text

    # Pass 1: drop consecutive duplicate lines; collapse blank-line runs
    # down to a single blank line.
    lines = text.split('\n')
    deduped_lines = []
    prev_line = None
    for line in lines:
        stripped = line.strip()
        if not stripped:
            if not deduped_lines or deduped_lines[-1].strip():
                deduped_lines.append(line)  # keep only the first empty line
            continue
        if stripped == prev_line:
            continue
        deduped_lines.append(line)
        prev_line = stripped
    deduped_text = '\n'.join(deduped_lines)

    # Pass 2: collapse immediately repeated words. A single re.sub pass
    # leaves residue for 3+ consecutive repeats ("the the the" -> "the the",
    # because each match consumes both words), so substitute to a fixpoint.
    word_pattern = r'\b(\w+)\s+\1\b'
    while True:
        collapsed = re.sub(word_pattern, r'\1', deduped_text)
        if collapsed == deduped_text:
            break
        deduped_text = collapsed

    # Pass 3: collapse immediately repeated phrases of 3 to 6 words.
    # This is a simplified approach and might need improvement.
    words = deduped_text.split()
    cleaned_words = []
    i = 0
    while i < len(words):
        found_repeat = False
        for phrase_len in range(3, min(7, len(words) - i)):
            next_pos = i + phrase_len
            if next_pos + phrase_len <= len(words):
                phrase = ' '.join(words[i:next_pos])
                next_phrase = ' '.join(words[next_pos:next_pos + phrase_len])
                if phrase.lower() == next_phrase.lower():
                    # Keep the first occurrence, skip the duplicate.
                    cleaned_words.extend(words[i:next_pos])
                    i = next_pos + phrase_len
                    found_repeat = True
                    break
        if not found_repeat:
            cleaned_words.append(words[i])
            i += 1

    final_text = ' '.join(cleaned_words)

    # Report how much was removed.
    original_len = len(text)
    cleaned_len = len(final_text)
    reduction = 100 * (original_len - cleaned_len) / max(1, original_len)
    logger.info(f"Text cleaning: removed {original_len - cleaned_len} chars ({reduction:.1f}% reduction)")

    return final_text
utils/pdf_ocr.py ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ PDFOCR - Module for processing PDF files with OCR and extracting structured data.
4
+ Provides robust PDF to image conversion before OCR processing.
5
+ """
6
+
7
+ import json
8
+ import os
9
+ import tempfile
10
+ import logging
11
+ from pathlib import Path
12
+ from typing import Optional, Dict, List, Union, Tuple, Any
13
+
14
+ # Configure logging
15
+ logging.basicConfig(level=logging.INFO,
16
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
17
+ logger = logging.getLogger("pdf_ocr")
18
+
19
+ # Import StructuredOCR for OCR processing
20
+ from structured_ocr import StructuredOCR
21
+
22
class PDFConversionResult:
    """Holds the outcome of a PDF-to-image conversion."""

    def __init__(self,
                 success: bool,
                 images: List[Path] = None,
                 error: str = None,
                 page_count: int = 0,
                 temp_files: List[str] = None):
        """Initialize the conversion result.

        Args:
            success: Whether the conversion was successful
            images: List of paths to the converted images
            error: Error message if conversion failed
            page_count: Total number of pages in the PDF
            temp_files: List of temporary files that should be cleaned up
        """
        self.success = success          # overall success flag
        self.images = images or []      # per-page image paths, in page order
        self.error = error              # human-readable failure reason
        self.page_count = page_count    # total pages in the source PDF
        self.temp_files = temp_files or []  # files owned by this result

    def __bool__(self):
        """Truthiness mirrors the success flag, so `if result:` works."""
        return self.success

    def cleanup(self):
        """Delete any temporary files produced during conversion.

        Safe to call repeatedly; the tracked list is emptied afterwards.
        """
        for path in self.temp_files:
            try:
                if os.path.exists(path):
                    os.unlink(path)
                    logger.debug(f"Removed temporary file: {path}")
            except Exception as e:
                # Best effort only: a stale temp file is not fatal.
                logger.warning(f"Failed to remove temporary file {path}: {e}")
        self.temp_files = []
60
+
61
+
62
class PDFOCR:
    """Class for processing PDF files with OCR and extracting structured data.

    PDFs are converted to per-page JPEG images with pdf2image, each page is
    OCR'd through StructuredOCR, and the per-page results are merged into a
    single payload. If conversion or per-image processing fails, the PDF is
    handed to StructuredOCR directly as a fallback.
    """

    def __init__(self, api_key=None):
        """Initialize the PDF OCR processor.

        Args:
            api_key: Optional API key forwarded to the underlying StructuredOCR.
        """
        self.processor = StructuredOCR(api_key=api_key)
        # Temporary page images produced during conversion; see cleanup().
        self.temp_files = []

    def __del__(self):
        """Clean up resources when the object is destroyed."""
        self.cleanup()

    def cleanup(self):
        """Clean up any temporary files tracked by this processor."""
        for temp_file in self.temp_files:
            try:
                if os.path.exists(temp_file):
                    os.unlink(temp_file)
                    logger.debug(f"Removed temporary file: {temp_file}")
            except Exception as e:
                # Best effort: a leftover temp file is not fatal.
                logger.warning(f"Failed to remove temporary file {temp_file}: {e}")
        self.temp_files = []

    def _get_page_count(self, pdf_path: Path) -> int:
        """Return the total page count of the PDF, or 1 if undeterminable.

        Reads pypdf metadata only — this avoids rasterizing a page just to
        learn the page count. (The previous implementation converted page 1
        at low DPI and then checked `hasattr(list, 'n_pages')`, which is
        always False for pdf2image's list return value, so that conversion
        was pure wasted work.)
        """
        try:
            from pypdf import PdfReader
            return len(PdfReader(pdf_path).pages)
        except Exception as e:  # narrowed from a bare except
            logger.warning(f"Failed to determine page count: {e}")
            return 1

    def convert_pdf_to_images(self,
                              pdf_path: Union[str, Path],
                              dpi: int = 200,
                              max_pages: Optional[int] = None,
                              page_numbers: Optional[List[int]] = None) -> PDFConversionResult:
        """
        Convert a PDF file to images.

        Args:
            pdf_path: Path to the PDF file
            dpi: DPI for the output images
            max_pages: Maximum number of pages to convert (None for all)
            page_numbers: Specific page numbers to convert (1-based indexing)

        Returns:
            PDFConversionResult object with conversion results
        """
        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            return PDFConversionResult(
                success=False,
                error=f"PDF file not found: {pdf_path}"
            )

        # Log file size for diagnostics on large documents.
        file_size_mb = pdf_path.stat().st_size / (1024 * 1024)
        logger.info(f"PDF size: {file_size_mb:.2f} MB")

        try:
            # Import pdf2image for conversion (optional dependency).
            import pdf2image

            temp_files = []
            # Cap worker threads at 4; pdf2image gains little beyond that.
            thread_count = min(4, os.cpu_count() or 2)

            logger.info("Determining PDF page count...")
            total_pages = self._get_page_count(pdf_path)
            logger.info(f"PDF has {total_pages} total pages")

            # Decide which pages to rasterize.
            if page_numbers and any(1 <= p <= total_pages for p in page_numbers):
                # Specific pages requested: keep only the valid ones.
                pages_to_process = [p for p in page_numbers if 1 <= p <= total_pages]
                logger.info(f"Converting {len(pages_to_process)} specified pages: {pages_to_process}")
            elif max_pages and max_pages < total_pages:
                pages_to_process = list(range(1, max_pages + 1))
                logger.info(f"Converting first {max_pages} pages of {total_pages} total")
            else:
                pages_to_process = list(range(1, total_pages + 1))
                logger.info(f"Converting all {total_pages} pages")

            # Convert in batches of up to 5 pages to bound peak memory use.
            converted_images = []
            batch_size = min(5, len(pages_to_process))
            for i in range(0, len(pages_to_process), batch_size):
                batch_pages = pages_to_process[i:i + batch_size]
                logger.info(f"Converting batch of pages {batch_pages}")

                try:
                    batch_images = pdf2image.convert_from_path(
                        pdf_path,
                        dpi=dpi,
                        first_page=min(batch_pages),
                        last_page=max(batch_pages),
                        thread_count=thread_count,
                        fmt="jpeg"
                    )

                    # pdf2image returns the full contiguous range
                    # [min(batch_pages), max(batch_pages)]; keep only pages
                    # that were actually requested.
                    for idx, page_num in enumerate(range(min(batch_pages), max(batch_pages) + 1)):
                        if page_num in pages_to_process and idx < len(batch_images):
                            img_temp_path = tempfile.NamedTemporaryFile(
                                suffix=f'_page{page_num}.jpg', delete=False).name
                            batch_images[idx].save(img_temp_path, format='JPEG', quality=95)

                            converted_images.append((page_num, Path(img_temp_path)))
                            temp_files.append(img_temp_path)
                except Exception as e:
                    # One failed batch must not abort the remaining batches.
                    logger.error(f"Failed to convert batch {batch_pages}: {e}")

            # Restore page order regardless of batch completion order.
            converted_images.sort(key=lambda x: x[0])
            image_paths = [img_path for _, img_path in converted_images]

            if not image_paths:
                return PDFConversionResult(
                    success=False,
                    error="Failed to convert PDF to images",
                    page_count=total_pages,
                    temp_files=temp_files
                )

            # Track temp files on the instance too, so self.cleanup() can
            # remove them even if the caller never calls result.cleanup().
            self.temp_files.extend(temp_files)

            return PDFConversionResult(
                success=True,
                images=image_paths,
                page_count=total_pages,
                temp_files=temp_files
            )

        except ImportError:
            return PDFConversionResult(
                success=False,
                error="pdf2image module not available. Please install with: pip install pdf2image"
            )
        except Exception as e:
            logger.error(f"PDF conversion error: {str(e)}")
            return PDFConversionResult(
                success=False,
                error=f"Failed to convert PDF to images: {str(e)}"
            )

    def process_pdf(self, pdf_path, use_vision=True, max_pages=None, custom_pages=None, custom_prompt=None):
        """
        Process a PDF file with OCR and extract structured data.

        Args:
            pdf_path: Path to the PDF file
            use_vision: Whether to use vision model for improved analysis
            max_pages: Maximum number of pages to process
            custom_pages: Specific page numbers to process (1-based indexing)
            custom_prompt: Custom instructions for processing

        Returns:
            Dictionary with structured OCR results

        Raises:
            FileNotFoundError: If the PDF file does not exist.
        """
        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Normalize custom_pages to a list of ints; accepts a list/tuple or
        # a comma-separated string such as "1,3,5".
        page_numbers = None
        if custom_pages:
            if isinstance(custom_pages, (list, tuple)):
                page_numbers = custom_pages
            else:
                try:
                    page_numbers = [int(p.strip()) for p in str(custom_pages).split(',')]
                except ValueError:  # narrowed from a bare except: only int() can fail here
                    logger.warning(f"Invalid custom_pages format: {custom_pages}. Should be list or comma-separated string.")

        # First try our optimized PDF to image conversion.
        conversion_result = self.convert_pdf_to_images(
            pdf_path=pdf_path,
            max_pages=max_pages,
            page_numbers=page_numbers
        )

        if conversion_result.success and conversion_result.images:
            logger.info(f"Successfully converted PDF to {len(conversion_result.images)} images")

            # Make sure the prompt tells the model this is a multi-page PDF.
            modified_prompt = custom_prompt
            if not modified_prompt:
                modified_prompt = f"This is a multi-page PDF document with {conversion_result.page_count} total pages, of which {len(conversion_result.images)} were processed."
            elif "pdf" not in modified_prompt.lower() and "multi-page" not in modified_prompt.lower():
                modified_prompt += f" This is a multi-page PDF document with {conversion_result.page_count} total pages, of which {len(conversion_result.images)} were processed."

            try:
                # Page 1 gets the full (optionally vision-assisted) treatment.
                first_page_result = self.processor.process_file(
                    file_path=conversion_result.images[0],
                    file_type="image",
                    use_vision=use_vision,
                    custom_prompt=modified_prompt
                )

                all_pages_text = []
                all_languages = set()

                # Collect text and languages from the first page.
                if 'ocr_contents' in first_page_result and 'raw_text' in first_page_result['ocr_contents']:
                    all_pages_text.append(first_page_result['ocr_contents']['raw_text'])
                if 'languages' in first_page_result:
                    for lang in first_page_result['languages']:
                        all_languages.add(str(lang))

                # Remaining pages get cheaper, text-only processing.
                for i, img_path in enumerate(conversion_result.images[1:], 1):
                    try:
                        page_result = self.processor.process_file(
                            file_path=img_path,
                            file_type="image",
                            use_vision=False,  # simpler processing for later pages
                            custom_prompt=f"This is page {i+1} of a {conversion_result.page_count}-page document."
                        )

                        if 'ocr_contents' in page_result and 'raw_text' in page_result['ocr_contents']:
                            all_pages_text.append(page_result['ocr_contents']['raw_text'])
                        if 'languages' in page_result:
                            for lang in page_result['languages']:
                                all_languages.add(str(lang))
                    except Exception as e:
                        # One bad page should not abort the whole document.
                        logger.warning(f"Error processing page {i+1}: {e}")

                # Merge per-page text back into the first page's payload.
                combined_text = "\n\n".join(all_pages_text)
                if 'ocr_contents' in first_page_result:
                    first_page_result['ocr_contents']['raw_text'] = combined_text

                if all_languages:
                    first_page_result['languages'] = list(all_languages)

                # PDF-level metadata.
                first_page_result['file_name'] = pdf_path.name
                first_page_result['file_type'] = "pdf"
                first_page_result['total_pages'] = conversion_result.page_count
                first_page_result['processed_pages'] = len(conversion_result.images)
                first_page_result['pdf_conversion'] = {
                    "method": "pdf2image",
                    "pages_converted": len(conversion_result.images),
                    "pages_requested": len(page_numbers) if page_numbers else (max_pages or conversion_result.page_count)
                }

                return first_page_result
            except Exception as e:
                logger.error(f"Error processing converted images: {e}")
                # Fall through to direct StructuredOCR processing below.
            finally:
                # Always remove the temporary page images.
                conversion_result.cleanup()

        # Conversion (or image processing) failed: let StructuredOCR handle
        # the PDF directly.
        logger.info(f"Using direct StructuredOCR processing for PDF")
        return self.processor.process_file(
            file_path=pdf_path,
            file_type="pdf",
            use_vision=use_vision,
            max_pages=max_pages,
            custom_pages=custom_pages,
            custom_prompt=custom_prompt
        )

    def save_json_output(self, pdf_path, output_path, use_vision=True, max_pages=None, custom_pages=None, custom_prompt=None):
        """
        Process a PDF file and save the structured output as JSON.

        Args:
            pdf_path: Path to the PDF file
            output_path: Path where to save the JSON output
            use_vision: Whether to use vision model for improved analysis
            max_pages: Maximum number of pages to process
            custom_pages: Specific page numbers to process (1-based indexing)
            custom_prompt: Custom instructions for processing

        Returns:
            Path to the saved JSON file
        """
        # Process the PDF first; any FileNotFoundError propagates to the caller.
        result = self.process_pdf(
            pdf_path,
            use_vision=use_vision,
            max_pages=max_pages,
            custom_pages=custom_pages,
            custom_prompt=custom_prompt
        )

        # Ensure the target directory exists before writing.
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Explicit encoding keeps output byte-identical across platforms.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2)

        return output_path
411
+
412
# For testing directly: command-line entry point for ad-hoc PDF OCR runs.
if __name__ == "__main__":
    import sys
    import argparse

    parser = argparse.ArgumentParser(description="Process PDF files with OCR.")
    parser.add_argument("pdf_path", help="Path to the PDF file to process")
    parser.add_argument("--output", "-o", help="Path to save the output JSON")
    parser.add_argument("--no-vision", dest="use_vision", action="store_false",
                        help="Disable vision model for processing")
    parser.add_argument("--max-pages", type=int, help="Maximum number of pages to process")
    parser.add_argument("--pages", help="Specific pages to process (comma-separated)")
    parser.add_argument("--prompt", help="Custom prompt for processing")

    args = parser.parse_args()

    processor = PDFOCR()

    # Parse --pages into a list of ints; exit with a clear message on
    # malformed input instead of a traceback.
    custom_pages = None
    if args.pages:
        try:
            custom_pages = [int(p.strip()) for p in args.pages.split(',')]
        except ValueError:  # narrowed from a bare except: only int() can fail here
            print(f"Error parsing pages: {args.pages}. Should be comma-separated list of numbers.")
            sys.exit(1)

    if args.output:
        # Write structured results to the requested JSON file.
        result_path = processor.save_json_output(
            args.pdf_path,
            args.output,
            use_vision=args.use_vision,
            max_pages=args.max_pages,
            custom_pages=custom_pages,
            custom_prompt=args.prompt
        )
        print(f"Results saved to: {result_path}")
    else:
        # No output file requested: print the JSON result to stdout.
        result = processor.process_pdf(
            args.pdf_path,
            use_vision=args.use_vision,
            max_pages=args.max_pages,
            custom_pages=custom_pages,
            custom_prompt=args.prompt
        )
        print(json.dumps(result, indent=2))