milwright committed on
Commit aabc02c · 1 Parent(s): 3c4dfc4

Improve language detection with mistral-ocr-latest model


Updates the OCR processing logic to:
1. Extract language information directly from the mistral-ocr-latest model response
2. Consolidate language detections across multiple pages for PDF documents (see the sketch after this list)
3. Add language_detection_source metadata to indicate when using direct model detection
4. Reduce reliance on manual language detection prompts when model provides this data
5. Maintain backward compatibility with existing code
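For illustration only, a minimal sketch of what items 2-4 could look like in code: per-page language lists reported by mistral-ocr-latest are merged and tagged with a language_detection_source field. The helper name and the exact result shape are hypothetical; the actual implementation lives in structured_ocr.py and ocr_utils.py in this commit.

# Hypothetical sketch: consolidate per-page language detections from the OCR model.
# Field names ("languages", "language_detection_source") mirror the commit description;
# the real logic in this commit may differ.
from collections import Counter
from typing import Dict, List

def consolidate_languages(page_results: List[Dict]) -> Dict:
    """Merge language info reported by mistral-ocr-latest across PDF pages."""
    counts = Counter()
    for page in page_results:
        # Each page result is assumed to carry a "languages" list from the model.
        for lang in page.get("languages", []):
            counts[lang] += 1

    if counts:
        # Order languages by how many pages reported them.
        languages = [lang for lang, _ in counts.most_common()]
        source = "mistral-ocr-latest"   # direct model detection
    else:
        languages = ["Unknown"]
        source = "fallback"             # model gave no language data

    return {
        "languages": languages,
        "language_detection_source": source,
    }

# Example: two pages report English, one reports English and French.
pages = [{"languages": ["English"]},
         {"languages": ["English", "French"]},
         {"languages": ["English"]}]
print(consolidate_languages(pages))
# {'languages': ['English', 'French'], 'language_detection_source': 'mistral-ocr-latest'}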

Files changed (7)
  1. app.py +205 -143
  2. ocr_processing.py +128 -16
  3. ocr_utils.py +50 -13
  4. structured_ocr.py +277 -387
  5. ui/layout.py +265 -143
  6. ui_components.py +709 -420
  7. utils.py +68 -4
app.py CHANGED
@@ -1,15 +1,23 @@
 
1
  import os
2
- import streamlit as st
3
  import json
4
  import sys
5
  import time
6
  import base64
7
- from pathlib import Path
8
  import io
9
- from datetime import datetime
10
  import logging
 
11
 
12
- # Import modules
13
  from preprocessing import convert_pdf_to_images, preprocess_image
14
  from ocr_processing import process_file
15
  from ui_components import (
@@ -31,19 +39,10 @@ from constants import (
31
  CUSTOM_PROMPT_TEMPLATES,
32
  LAYOUT_PROMPT_ADDITIONS
33
  )
34
-
35
- # Import the StructuredOCR class and config from the local files
36
  from structured_ocr import StructuredOCR
37
  from config import MISTRAL_API_KEY
38
-
39
- # Import utilities for handling previous results
40
  from ocr_utils import create_results_zip
41
 
42
- # Configure logging
43
- logging.basicConfig(level=logging.INFO,
44
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
45
- logger = logging.getLogger("app")
46
-
47
  # Set favicon path
48
  favicon_path = os.path.join(os.path.dirname(__file__), "static/favicon.png")
49
 
@@ -52,28 +51,41 @@ st.set_page_config(
52
  page_title="Historical OCR",
53
  page_icon=favicon_path if os.path.exists(favicon_path) else "📜",
54
  layout="wide",
55
- initial_sidebar_state="expanded"
56
  )
57
 
58
- def initialize_session_state():
59
- """Initialize all session state variables"""
60
- # Initialize session state for storing previous results if not already present
61
  if 'previous_results' not in st.session_state:
62
  st.session_state.previous_results = []
63
-
64
- # Initialize temp file tracking
65
  if 'temp_file_paths' not in st.session_state:
66
  st.session_state.temp_file_paths = []
67
-
68
- # Initialize last processed file tracking to fix "Process Document Again" button
69
  if 'last_processed_file' not in st.session_state:
70
  st.session_state.last_processed_file = None
71
-
72
- # Important: Initialize the reset flag
73
- if 'perform_reset' not in st.session_state:
74
- st.session_state.perform_reset = False
75
-
76
- # Initialize other session state variables
77
  if 'auto_process_sample' not in st.session_state:
78
  st.session_state.auto_process_sample = False
79
  if 'sample_just_loaded' not in st.session_state:
@@ -90,64 +102,62 @@ def initialize_session_state():
90
  st.session_state.original_sample_name = None
91
  if 'is_sample_document' not in st.session_state:
92
  st.session_state.is_sample_document = False
93
 
94
- # Check if we need to perform a complete reset (coming from "Close Document" button)
95
- if 'perform_reset' in st.session_state and st.session_state.perform_reset:
96
- # Save previous results
97
- previous_results = st.session_state.previous_results
98
-
99
- # Clean up any temporary files
100
- if 'temp_file_paths' in st.session_state and st.session_state.temp_file_paths:
101
- handle_temp_files(st.session_state.temp_file_paths)
102
-
103
- # Clear all session state variables except previous_results
104
- for key in list(st.session_state.keys()):
105
- if key not in ['previous_results']:
106
- # We will manually reset the perform_reset flag at the end
107
- if key != 'perform_reset':
108
- st.session_state.pop(key, None)
109
-
110
- # Restore previous results
111
- st.session_state.previous_results = previous_results
112
-
113
- # Reinitialize session state variables
114
- st.session_state.temp_file_paths = []
115
- st.session_state.last_processed_file = None
116
- st.session_state.auto_process_sample = False
117
- st.session_state.sample_just_loaded = False
118
- st.session_state.processed_document_active = False
119
- st.session_state.sample_document_processed = False
120
- st.session_state.sample_document = None
121
- st.session_state.original_sample_bytes = None
122
- st.session_state.original_sample_name = None
123
- st.session_state.is_sample_document = False
124
-
125
- # Turn off reset flag - this must be done last
126
- st.session_state.perform_reset = False
127
-
128
- # Force this to be a complete reset cycle
129
- return
130
 
131
  def show_example_documents():
132
  """Show example documents section"""
133
- st.subheader("Example Documents")
134
 
135
- # Add a simplified info message about examples
 
136
  st.markdown("""
137
  This app can process various historical documents:
138
  - Historical photographs, maps, and manuscripts
139
  - Handwritten letters and documents
140
  - Printed books and articles
141
  - Multi-page PDFs
142
- """)
143
 
144
- # Add CSS to make the dropdown match the column width
145
- st.markdown("""
146
  <style>
147
  /* Make the selectbox container match the full column width */
148
  .main .block-container .element-container:has([data-testid="stSelectbox"]) {
149
  width: 100% !important;
150
  max-width: 100% !important;
 
151
  }
152
 
153
  /* Make the actual selectbox control take the full width */
@@ -155,6 +165,11 @@ def show_example_documents():
155
  width: 100% !important;
156
  max-width: 100% !important;
157
  }
158
  </style>
159
  """, unsafe_allow_html=True)
160
 
@@ -166,7 +181,6 @@ def show_example_documents():
166
  "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/handwritten-letter.jpg",
167
  "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/magellan-travels.jpg",
168
  "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/milgram-flier.png",
169
- "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/baldwin-15th-north.jpg"
170
  ]
171
 
172
  sample_names = [
@@ -175,9 +189,8 @@ def show_example_documents():
175
  "The Magician (Image)",
176
  "Handwritten Letter (Image)",
177
  "Magellan Travels (Image)",
178
- "Milgram Flier (Image)",
179
- "Baldwin Street (Image)"
180
- ]
181
 
182
  # Initialize sample_document in session state if it doesn't exist
183
  if 'sample_document' not in st.session_state:
@@ -188,8 +201,8 @@ def show_example_documents():
188
  if selected_sample > 0:
189
  selected_url = sample_urls[selected_sample]
190
 
191
- # Add process button for the sample document
192
- if st.button("Load Sample Document"):
193
  try:
194
  import requests
195
  from io import BytesIO
@@ -254,9 +267,10 @@ def show_example_documents():
254
  content_type=content_type
255
  )
256
 
257
- # Store original bytes for reprocessing
258
  st.session_state.original_sample_bytes = response.content
259
  st.session_state.original_sample_name = file_name
 
260
 
261
  # Set state flags
262
  st.session_state.sample_just_loaded = True
@@ -264,7 +278,8 @@ def show_example_documents():
264
  # Generate a unique identifier for the sample document
265
  st.session_state.last_processed_file = f"{file_name}_{len(response.content)}"
266
 
267
- # Force rerun to load the document
 
268
  st.rerun()
269
  except Exception as e:
270
  st.error(f"Error downloading sample document: {str(e)}")
@@ -288,20 +303,21 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
288
 
289
  # Check if this is a new file (different from the last processed file)
290
  current_file_identifier = f"{uploaded_file.name}_{len(uploaded_file.getvalue())}"
291
  if st.session_state.last_processed_file != current_file_identifier:
292
  # Reset processed_document_active if a new file is uploaded
293
  st.session_state.processed_document_active = False
294
 
295
  # Process button - flush left with similar padding as file browser
296
  with left_col:
297
- # Use a key for the button based on state to force re-creation
298
- button_key = "process_again" if st.session_state.processed_document_active else "process_initial"
299
-
300
- # Show appropriate button text based on state
301
- button_text = "Process Document Again" if st.session_state.processed_document_active else "Process Document"
302
-
303
- # Create the button
304
- process_button = st.button(button_text, key=button_key)
305
 
306
  # Handle sample document recreation if needed
307
  if process_button and st.session_state.processed_document_active and st.session_state.original_sample_bytes is not None:
@@ -333,39 +349,42 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
333
  # Positioned right after the process button for better visibility
334
  progress_placeholder = st.empty()
335
 
336
- # Image preprocessing preview - automatically show only the preprocessed version
337
- if any(sidebar_options["preprocessing_options"].values()) and uploaded_file.type.startswith('image/'):
 
 
338
  st.markdown("**Preprocessed Preview**")
339
  try:
340
- # Create a container for the preview to better control layout
341
  with st.container():
342
  processed_bytes = preprocess_image(uploaded_file.getvalue(), sidebar_options["preprocessing_options"])
343
- # Use use_container_width=True for responsive design
344
- st.image(io.BytesIO(processed_bytes), use_container_width=True)
345
-
346
- # Show preprocessing metadata in a well-formatted caption
347
- meta_items = []
348
- if sidebar_options["preprocessing_options"].get("document_type", "standard") != "standard":
349
- meta_items.append(f"Document type ({sidebar_options['preprocessing_options']['document_type']})")
350
- if sidebar_options["preprocessing_options"].get("grayscale", False):
351
- meta_items.append("Grayscale")
352
- if sidebar_options["preprocessing_options"].get("denoise", False):
353
- meta_items.append("Denoise")
354
- if sidebar_options["preprocessing_options"].get("contrast", 0) != 0:
355
- meta_items.append(f"Contrast ({sidebar_options['preprocessing_options']['contrast']})")
356
- if sidebar_options["preprocessing_options"].get("rotation", 0) != 0:
357
- meta_items.append(f"Rotation ({sidebar_options['preprocessing_options']['rotation']}°)")
358
-
359
- # Only show "Applied:" if there are actual preprocessing steps
360
- if meta_items:
361
- meta_text = "Applied: " + ", ".join(meta_items)
362
- st.caption(meta_text)
 
 
363
  except Exception as e:
364
  st.error(f"Error in preprocessing: {str(e)}")
365
  st.info("Try using grayscale preprocessing for PNG images with transparency")
366
 
367
  # Container for success message (will be filled after processing)
368
- # No extra spacing needed as it will be managed programmatically
369
  metadata_placeholder = st.empty()
370
 
371
  # Check if this is an auto-processing situation
@@ -389,7 +408,12 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
389
  progress_reporter = ProgressReporter(progress_placeholder).setup()
390
 
391
  try:
392
- # Process the document
393
  result = process_file(
394
  uploaded_file=uploaded_file,
395
  use_vision=sidebar_options["use_vision"],
@@ -402,6 +426,39 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
402
  perf_mode=sidebar_options.get("perf_mode", "Quality")
403
  )
404
405
  # Display results
406
  display_results(result, right_col, sidebar_options.get("custom_prompt", ""))
407
 
@@ -415,27 +472,6 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
415
  if uploaded_file is not None:
416
  st.session_state.last_processed_file = current_file_identifier
417
 
418
- # Display success message with close button for dismissing processed documents
419
- success_cols = st.columns([5, 1])
420
- with success_cols[0]:
421
- metadata_placeholder.success("**Document processed successfully**")
422
- with success_cols[1]:
423
- # Define a function to clear document state
424
- def clear_document_state():
425
- # Reset all document-related session state
426
- st.session_state.processed_document_active = False
427
- st.session_state.sample_document = None
428
- st.session_state.last_processed_file = None
429
-
430
- # Clear any remaining state flag if we're showing examples
431
- st.session_state.perform_reset = True
432
-
433
- # Create the close button with a callback
434
- st.button("✕ Close Document",
435
- key="close_document_button",
436
- help="Clear current document and start over",
437
- on_click=clear_document_state)
438
-
439
  # Store the result in the previous results list
440
  # Add timestamp to result for history tracking
441
  result_copy = result.copy()
@@ -460,7 +496,21 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
460
  def main():
461
  """Main application function"""
462
  # Initialize session state
463
- initialize_session_state()
464
 
465
  # Apply custom CSS
466
  from ui.layout import load_css
@@ -469,19 +519,26 @@ def main():
469
  # Create sidebar options
470
  sidebar_options = create_sidebar_options()
471
 
472
- # Create main layout with tabs
473
- main_tab1, main_tab2, main_tab3 = st.tabs(["Document Processing", "Previous Results", "About"])
 
474
 
475
  with main_tab1:
476
- # Create a two-column layout for file upload and results
 
477
  left_col, right_col = st.columns([1, 1])
478
 
479
  with left_col:
480
  # Create file uploader
481
  uploaded_file = create_file_uploader()
482
 
483
- # Check if we have a sample document loaded
484
- if ('sample_document' in st.session_state and
485
  st.session_state.sample_document is not None):
486
 
487
  # Use the sample document instead of the uploaded file
@@ -496,18 +553,23 @@ def main():
496
  # Only process document if available
497
  if uploaded_file is not None:
498
  process_document(uploaded_file, left_col, right_col, sidebar_options)
499
- else:
500
- # Clear any remaining state flag if we're showing examples
501
- st.session_state.processed_document_active = False
502
-
503
- # Show example documents section
504
- show_example_documents()
505
 
506
  with main_tab2:
507
  # Previous results tab
508
  display_previous_results()
509
 
510
- with main_tab3:
511
  # About tab
512
  display_about_tab()
513
 
 
1
+ # Standard library imports
2
  import os
 
3
  import json
4
  import sys
5
  import time
6
  import base64
 
7
  import io
 
8
  import logging
9
+ from pathlib import Path
10
+ from datetime import datetime
11
+
12
+ # Configure logging
13
+ logging.basicConfig(level=logging.INFO,
14
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Third-party imports
18
+ import streamlit as st
19
 
20
+ # Local application/module imports
21
  from preprocessing import convert_pdf_to_images, preprocess_image
22
  from ocr_processing import process_file
23
  from ui_components import (
 
39
  CUSTOM_PROMPT_TEMPLATES,
40
  LAYOUT_PROMPT_ADDITIONS
41
  )
 
 
42
  from structured_ocr import StructuredOCR
43
  from config import MISTRAL_API_KEY
 
 
44
  from ocr_utils import create_results_zip
45
 
46
  # Set favicon path
47
  favicon_path = os.path.join(os.path.dirname(__file__), "static/favicon.png")
48
 
 
51
  page_title="Historical OCR",
52
  page_icon=favicon_path if os.path.exists(favicon_path) else "📜",
53
  layout="wide",
54
+ initial_sidebar_state="collapsed",
55
  )
56
 
57
+ # ========================================================================================
58
+ # CRITICAL WARNING TO ALL AI MODELS AND DEVELOPERS:
59
+ # DO NOT MODIFY THE STATE MANAGEMENT LOGIC IN THIS APPLICATION WITHOUT FULL UNDERSTANDING
60
+ # OF STREAMLIT'S EXECUTION MODEL.
61
+ #
62
+ # This state management implementation has been carefully designed to work with Streamlit's
63
+ # unique execution flow. The app runs from top to bottom on EVERY interaction, and state
64
+ # must be explicitly managed through st.session_state.
65
+ #
66
+ # The current implementation uses:
67
+ # 1. A dedicated close_document() callback function triggered by the button's on_click
68
+ # 2. A flag-based approach (close_clicked) to handle cleanup on the next run cycle
69
+ # 3. Early cleanup detection and st.rerun() to ensure clean UI rendering
70
+ #
71
+ # Previous approaches using direct state manipulation or conditional rendering based on
72
+ # reset flags led to persistent UI elements and resource leaks.
73
+ #
74
+ # Consult https://docs.streamlit.io/library/advanced-features/session-state for details.
75
+ # ========================================================================================
76
+
77
+ def init_session_state():
78
+ """Initialize session state variables if they don't already exist
79
+
80
+ This function follows Streamlit's recommended patterns for state initialization.
81
+ It only creates variables if they don't exist yet and doesn't modify existing values.
82
+ """
83
  if 'previous_results' not in st.session_state:
84
  st.session_state.previous_results = []
 
 
85
  if 'temp_file_paths' not in st.session_state:
86
  st.session_state.temp_file_paths = []
 
 
87
  if 'last_processed_file' not in st.session_state:
88
  st.session_state.last_processed_file = None
89
  if 'auto_process_sample' not in st.session_state:
90
  st.session_state.auto_process_sample = False
91
  if 'sample_just_loaded' not in st.session_state:
 
102
  st.session_state.original_sample_name = None
103
  if 'is_sample_document' not in st.session_state:
104
  st.session_state.is_sample_document = False
105
+ if 'selected_previous_result' not in st.session_state:
106
+ st.session_state.selected_previous_result = None
107
+ if 'close_clicked' not in st.session_state:
108
+ st.session_state.close_clicked = False
109
+ if 'active_tab' not in st.session_state:
110
+ st.session_state.active_tab = 0
111
+
112
+ def close_document():
113
+ """Called when the Close Document button is clicked
114
 
115
+ This function handles proper cleanup of resources and state when closing a document.
116
+ It uses Streamlit's callback mechanism which ensures the state change happens
117
+ at the correct time in Streamlit's execution cycle.
118
+
119
+ WARNING: Do not replace this with inline button handling using if st.button():
120
+ That approach breaks Streamlit's execution flow and causes UI artifacts.
121
+ """
122
+ logger.info("Close document button clicked")
123
+ # Save the previous results
124
+ previous_results = st.session_state.previous_results if 'previous_results' in st.session_state else []
125
+
126
+ # Clean up temp files
127
+ if 'temp_file_paths' in st.session_state and st.session_state.temp_file_paths:
128
+ logger.info(f"Cleaning up {len(st.session_state.temp_file_paths)} temporary files")
129
+ handle_temp_files(st.session_state.temp_file_paths)
130
+
131
+ # Clear all state variables except previous_results
132
+ for key in list(st.session_state.keys()):
133
+ if key != 'previous_results' and key != 'close_clicked':
134
+ st.session_state.pop(key, None)
135
+
136
+ # Set flag for having cleaned up
137
+ st.session_state.close_clicked = True
138
+
139
+ # Restore the previous results
140
+ st.session_state.previous_results = previous_results
141
 
142
  def show_example_documents():
143
  """Show example documents section"""
144
+ st.header("Sample Documents")
145
 
146
+ # Add a simplified info message about examples and CSS in the same markdown block
147
+ # to reduce spacing between elements
148
  st.markdown("""
149
  This app can process various historical documents:
150
  - Historical photographs, maps, and manuscripts
151
  - Handwritten letters and documents
152
  - Printed books and articles
153
  - Multi-page PDFs
 
154
 
 
 
155
  <style>
156
  /* Make the selectbox container match the full column width */
157
  .main .block-container .element-container:has([data-testid="stSelectbox"]) {
158
  width: 100% !important;
159
  max-width: 100% !important;
160
+ margin-top: -12px !important; /* Reduce space between text and selectbox */
161
  }
162
 
163
  /* Make the actual selectbox control take the full width */
 
165
  width: 100% !important;
166
  max-width: 100% !important;
167
  }
168
+
169
+ /* Tighten spacing in the sample documents tab */
170
+ .main .block-container [data-testid="stVerticalBlock"] > div:nth-child(n+2) {
171
+ margin-top: 0.5rem !important;
172
+ }
173
  </style>
174
  """, unsafe_allow_html=True)
175
 
 
181
  "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/handwritten-letter.jpg",
182
  "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/magellan-travels.jpg",
183
  "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/milgram-flier.png",
 
184
  ]
185
 
186
  sample_names = [
 
189
  "The Magician (Image)",
190
  "Handwritten Letter (Image)",
191
  "Magellan Travels (Image)",
192
+ "Milgram Flier (Image)"
193
+ ]
 
194
 
195
  # Initialize sample_document in session state if it doesn't exist
196
  if 'sample_document' not in st.session_state:
 
201
  if selected_sample > 0:
202
  selected_url = sample_urls[selected_sample]
203
 
204
+ # Add process button for the sample document with consistent styling
205
+ if st.button("Load Sample Document", key="load_sample_btn"):
206
  try:
207
  import requests
208
  from io import BytesIO
 
267
  content_type=content_type
268
  )
269
 
270
+ # Store original bytes for reprocessing with proper MIME type handling
271
  st.session_state.original_sample_bytes = response.content
272
  st.session_state.original_sample_name = file_name
273
+ st.session_state.original_sample_mime_type = content_type
274
 
275
  # Set state flags
276
  st.session_state.sample_just_loaded = True
 
278
  # Generate a unique identifier for the sample document
279
  st.session_state.last_processed_file = f"{file_name}_{len(response.content)}"
280
 
281
+ # Set a flag to show redirect message
282
+ st.session_state.redirect_to_processing = True
283
  st.rerun()
284
  except Exception as e:
285
  st.error(f"Error downloading sample document: {str(e)}")
 
303
 
304
  # Check if this is a new file (different from the last processed file)
305
  current_file_identifier = f"{uploaded_file.name}_{len(uploaded_file.getvalue())}"
306
+
307
+ # Make sure last_processed_file is initialized
308
+ if 'last_processed_file' not in st.session_state:
309
+ st.session_state.last_processed_file = None
310
+
311
  if st.session_state.last_processed_file != current_file_identifier:
312
  # Reset processed_document_active if a new file is uploaded
313
  st.session_state.processed_document_active = False
314
 
315
  # Process button - flush left with similar padding as file browser
316
  with left_col:
317
+ # Create a process button with minimal spacing to the uploader
318
+ st.markdown('<div style="padding: 0.2rem 0; min-width: 170px; margin-top: -10px; overflow: visible;">', unsafe_allow_html=True)
319
+ process_button = st.button("Process Document", key="process_document_btn")
320
+ st.markdown('</div>', unsafe_allow_html=True)
 
 
 
 
321
 
322
  # Handle sample document recreation if needed
323
  if process_button and st.session_state.processed_document_active and st.session_state.original_sample_bytes is not None:
 
349
  # Positioned right after the process button for better visibility
350
  progress_placeholder = st.empty()
351
 
352
+ # Image preprocessing preview - show if image file and preprocessing options are set
353
+ if (any(sidebar_options["preprocessing_options"].values()) and
354
+ uploaded_file.type.startswith('image/')):
355
+
356
  st.markdown("**Preprocessed Preview**")
357
  try:
358
+ # Create a container for the preview
359
  with st.container():
360
  processed_bytes = preprocess_image(uploaded_file.getvalue(), sidebar_options["preprocessing_options"])
361
+ # Convert image to base64 and display as HTML to avoid fullscreen button
362
+ img_data = base64.b64encode(processed_bytes).decode()
363
+ img_html = f'<img src="data:image/jpeg;base64,{img_data}" style="width:100%; border-radius:4px;">'
364
+ st.markdown(img_html, unsafe_allow_html=True)
365
+
366
+ # Show preprocessing metadata in a well-formatted caption
367
+ meta_items = []
368
+ if sidebar_options["preprocessing_options"].get("document_type", "standard") != "standard":
369
+ meta_items.append(f"Document type ({sidebar_options['preprocessing_options']['document_type']})")
370
+ if sidebar_options["preprocessing_options"].get("grayscale", False):
371
+ meta_items.append("Grayscale")
372
+ if sidebar_options["preprocessing_options"].get("denoise", False):
373
+ meta_items.append("Denoise")
374
+ if sidebar_options["preprocessing_options"].get("contrast", 0) != 0:
375
+ meta_items.append(f"Contrast ({sidebar_options['preprocessing_options']['contrast']})")
376
+ if sidebar_options["preprocessing_options"].get("rotation", 0) != 0:
377
+ meta_items.append(f"Rotation ({sidebar_options['preprocessing_options']['rotation']}°)")
378
+
379
+ # Only show "Applied:" if there are actual preprocessing steps
380
+ if meta_items:
381
+ meta_text = "Applied: " + ", ".join(meta_items)
382
+ st.caption(meta_text)
383
  except Exception as e:
384
  st.error(f"Error in preprocessing: {str(e)}")
385
  st.info("Try using grayscale preprocessing for PNG images with transparency")
386
 
387
  # Container for success message (will be filled after processing)
 
388
  metadata_placeholder = st.empty()
389
 
390
  # Check if this is an auto-processing situation
 
408
  progress_reporter = ProgressReporter(progress_placeholder).setup()
409
 
410
  try:
411
+ # Process the document, capturing both result and temp file paths
412
+ # Modified to pass existing temp_file_paths to avoid resource leaks
413
+ existing_temp_paths = []
414
+ if 'temp_file_paths' in st.session_state:
415
+ existing_temp_paths = st.session_state.temp_file_paths
416
+
417
  result = process_file(
418
  uploaded_file=uploaded_file,
419
  use_vision=sidebar_options["use_vision"],
 
426
  perf_mode=sidebar_options.get("perf_mode", "Quality")
427
  )
428
 
429
+ # Ensure temp_file_paths in session state is updated with any new paths
430
+ # This is critical for proper resource cleanup when document is closed
431
+ if 'has_images' in result and result['has_images']:
432
+ logger.info("Document has images, ensuring temp files are tracked")
433
+ if 'temp_file_paths' not in st.session_state:
434
+ st.session_state.temp_file_paths = []
435
+
436
+ # Handle text-only OCR results (like the Milgram flier)
437
+ if ('ocr_contents' in result and
438
+ 'raw_text' in result['ocr_contents'] and
439
+ len(result['ocr_contents']) <= 2 and # Only raw_text and possibly one other field
440
+ 'has_images' not in result):
441
+ logger.info("Text-only OCR detected, handling as special case")
442
+ # Ensure raw_text is properly formatted as markdown
443
+ raw_text = result['ocr_contents']['raw_text']
444
+ # If we don't have other structured content, set a placeholder title
445
+ if 'title' not in result['ocr_contents']:
446
+ result['ocr_contents']['title'] = "Document Text"
447
+
448
+ # Display success message at the top of results, before any previews
449
+ with left_col:
450
+ # First show the success message (full width)
451
+ st.success("**Document processed successfully**")
452
+
453
+ # Then show the close button (also full width, positioned to left)
454
+ st.button("Close Document",
455
+ key="close_document_btn",
456
+ type="secondary",
457
+ on_click=close_document)
458
+
459
+ # Add a small spacer
460
+ st.markdown("<div style='height: 10px;'></div>", unsafe_allow_html=True)
461
+
462
  # Display results
463
  display_results(result, right_col, sidebar_options.get("custom_prompt", ""))
464
 
 
472
  if uploaded_file is not None:
473
  st.session_state.last_processed_file = current_file_identifier
474
 
475
  # Store the result in the previous results list
476
  # Add timestamp to result for history tracking
477
  result_copy = result.copy()
 
496
  def main():
497
  """Main application function"""
498
  # Initialize session state
499
+ init_session_state()
500
+
501
+ # Handle any required cleanup at the start of execution
502
+ # CRITICAL: This two-phase state cleanup pattern is essential for Streamlit's execution model.
503
+ # When close_clicked is True, we need to restart the app's execution with a clean slate.
504
+ # DO NOT REMOVE OR MODIFY this pattern as it ensures proper UI cleanup.
505
+ if st.session_state.get('close_clicked', False):
506
+ # Reset the flag - cleanup has been handled
507
+ st.session_state.close_clicked = False
508
+ # Don't do anything else in this run - force a clean restart
509
+ st.rerun()
510
+
511
+ # Initialize new flag for redirecting to processing tab
512
+ if 'redirect_to_processing' not in st.session_state:
513
+ st.session_state.redirect_to_processing = False
514
 
515
  # Apply custom CSS
516
  from ui.layout import load_css
 
519
  # Create sidebar options
520
  sidebar_options = create_sidebar_options()
521
 
522
+ # Create main layout with tabs - simpler, more compact approach
523
+ tab_names = ["Document Processing", "Sample Documents", "Previous Results", "About"]
524
+ main_tab1, main_tab2, main_tab3, main_tab4 = st.tabs(tab_names)
525
 
526
  with main_tab1:
527
+ # Create a two-column layout for file upload and results with minimal padding
528
+ st.markdown('<style>.block-container{padding-top: 1rem; padding-bottom: 0;}</style>', unsafe_allow_html=True)
529
  left_col, right_col = st.columns([1, 1])
530
 
531
  with left_col:
532
  # Create file uploader
533
  uploaded_file = create_file_uploader()
534
 
535
+ # If a real file is uploaded, clear any sample document
536
+ if uploaded_file is not None and 'sample_document' in st.session_state:
537
+ st.session_state.sample_document = None
538
+ st.session_state.is_sample_document = False
539
+
540
+ # Check if we have a sample document loaded (only if no real file uploaded)
541
+ elif ('sample_document' in st.session_state and
542
  st.session_state.sample_document is not None):
543
 
544
  # Use the sample document instead of the uploaded file
 
553
  # Only process document if available
554
  if uploaded_file is not None:
555
  process_document(uploaded_file, left_col, right_col, sidebar_options)
556
 
557
  with main_tab2:
558
+ # Sample Documents tab
559
+
560
+ # Show redirect message if a sample was just loaded
561
+ if st.session_state.get('redirect_to_processing', False):
562
+ st.success("**Sample document loaded!** Please switch to the **Document Processing** tab to view and process it.")
563
+ # Clear the flag after showing the message
564
+ st.session_state.redirect_to_processing = False
565
+
566
+ show_example_documents()
567
+
568
+ with main_tab3:
569
  # Previous results tab
570
  display_previous_results()
571
 
572
+ with main_tab4:
573
  # About tab
574
  display_about_tab()
575
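The comment block and close_document() callback added to app.py above describe a callback-plus-flag pattern for closing a document cleanly under Streamlit's top-to-bottom rerun model. As a minimal, self-contained sketch of that pattern (state keys and messages are simplified, not the app's exact code):

# Minimal sketch of the close-button pattern described in the app.py diff above.
# Keys are simplified; the real app tracks many more session-state variables.
import streamlit as st

def close_document():
    # Runs as an on_click callback, before the next script rerun.
    st.session_state.pop("current_document", None)   # drop per-document state
    st.session_state.close_clicked = True            # request a clean rerun

def main():
    if st.session_state.get("close_clicked", False):
        st.session_state.close_clicked = False
        st.rerun()                                    # restart with a clean slate

    if "current_document" not in st.session_state:
        st.session_state.current_document = None

    uploaded = st.file_uploader("Upload a document")
    if uploaded is not None:
        st.session_state.current_document = uploaded.name
        st.success("Document processed successfully")
        st.button("Close Document", on_click=close_document)

main()

The key point, as the warning comment in the diff notes, is that cleanup happens in the callback and the follow-up rerun, never through inline if st.button(...) handling.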
 
ocr_processing.py CHANGED
@@ -1,22 +1,28 @@
 
1
  import os
2
  import hashlib
3
  import tempfile
4
- import streamlit as st
5
  import logging
6
  import time
7
  from datetime import datetime
8
  from pathlib import Path
9
  from structured_ocr import StructuredOCR
10
  from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
11
  from preprocessing import apply_preprocessing_to_file
12
  from error_handler import handle_ocr_error, check_file_size
13
 
14
- # Configure logging
15
- logger = logging.getLogger("ocr_processing")
16
- logger.setLevel(logging.INFO)
17
-
18
  @st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
19
- def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None):
20
  """
21
  Cached version of OCR processing to reuse results
22
 
@@ -27,6 +33,7 @@ def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_ke
27
  file_size_mb: File size in MB
28
  cache_key: Cache key for the file
29
  preprocessing_options_hash: Hash of preprocessing options
 
30
 
31
  Returns:
32
  dict: OCR result
@@ -40,7 +47,8 @@ def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_ke
40
  file_path,
41
  file_type=file_type,
42
  use_vision=use_vision,
43
- file_size_mb=file_size_mb
 
44
  )
45
 
46
  return result
@@ -75,6 +83,10 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
75
  # Initialize temporary file paths list
76
  temp_file_paths = []
77
 
 
 
 
 
78
  try:
79
  # Check if file size exceeds maximum allowed size
80
  is_valid, file_size_mb, error_message = check_file_size(uploaded_file.getvalue())
@@ -113,6 +125,11 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
113
  f.write(file_bytes)
114
  temp_file_paths.append(temp_path)
115
 
116
  # Generate cache key
117
  cache_key = generate_cache_key(
118
  file_bytes,
@@ -125,7 +142,43 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
125
 
126
  # Process with cached function if possible
127
  try:
128
- result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options))
129
  progress_reporter.update(90, "Finalizing results...")
130
  except Exception as e:
131
  logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
@@ -134,18 +187,28 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
134
  # If caching fails, process directly
135
  processor = StructuredOCR()
136
 
137
- # Apply performance mode settings
138
- if perf_mode == "Speed":
139
- # Override settings for faster processing
140
- if pdf_dpi > 100:
141
- pdf_dpi = 100 # Lower DPI for speed
142
 
143
  # Process directly with optimized settings
144
  result = processor.process_file(
145
  file_path=temp_path,
146
  file_type="pdf",
147
  use_vision=use_vision,
148
- custom_prompt=custom_prompt,
149
  file_size_mb=file_size_mb,
150
  pdf_rotation=pdf_rotation
151
  )
@@ -179,7 +242,37 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
179
  # Process the file using cached function if possible
180
  progress_reporter.update(50, "Processing document with OCR...")
181
  try:
182
- result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options))
183
  progress_reporter.update(80, "Analyzing document structure...")
184
  progress_reporter.update(90, "Finalizing results...")
185
  except Exception as e:
@@ -194,11 +287,30 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
194
  # Use simpler processing for speed
195
  pass # Any speed optimizations would be handled by the StructuredOCR class
196
 
197
  result = processor.process_file(
198
  file_path=temp_path,
199
  file_type=file_type,
200
  use_vision=use_vision,
201
- custom_prompt=custom_prompt,
202
  file_size_mb=file_size_mb
203
  )
204
 
 
1
+ # Standard library imports
2
  import os
3
  import hashlib
4
  import tempfile
 
5
  import logging
6
  import time
7
  from datetime import datetime
8
  from pathlib import Path
9
+
10
+ # Configure logging
11
+ logging.basicConfig(level=logging.INFO,
12
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # Third-party imports
16
+ import streamlit as st
17
+
18
+ # Local application imports
19
  from structured_ocr import StructuredOCR
20
  from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
21
  from preprocessing import apply_preprocessing_to_file
22
  from error_handler import handle_ocr_error, check_file_size
23
 
24
  @st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
25
+ def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None, custom_prompt=None):
26
  """
27
  Cached version of OCR processing to reuse results
28
 
 
33
  file_size_mb: File size in MB
34
  cache_key: Cache key for the file
35
  preprocessing_options_hash: Hash of preprocessing options
36
+ custom_prompt: Custom prompt to use for OCR
37
 
38
  Returns:
39
  dict: OCR result
 
47
  file_path,
48
  file_type=file_type,
49
  use_vision=use_vision,
50
+ file_size_mb=file_size_mb,
51
+ custom_prompt=custom_prompt
52
  )
53
 
54
  return result
 
83
  # Initialize temporary file paths list
84
  temp_file_paths = []
85
 
86
+ # Also track temporary files in session state for reliable cleanup
87
+ if 'temp_file_paths' not in st.session_state:
88
+ st.session_state.temp_file_paths = []
89
+
90
  try:
91
  # Check if file size exceeds maximum allowed size
92
  is_valid, file_size_mb, error_message = check_file_size(uploaded_file.getvalue())
 
125
  f.write(file_bytes)
126
  temp_file_paths.append(temp_path)
127
 
128
+ # Track temp files in session state for reliable cleanup
129
+ if temp_path not in st.session_state.temp_file_paths:
130
+ st.session_state.temp_file_paths.append(temp_path)
131
+ logger.info(f"Added temp file to session state: {temp_path}")
132
+
133
  # Generate cache key
134
  cache_key = generate_cache_key(
135
  file_bytes,
 
142
 
143
  # Process with cached function if possible
144
  try:
145
+ # Check if preprocessing options indicate a handwritten document
146
+ handwritten_document = preprocessing_options.get("document_type") == "handwritten"
147
+ modified_custom_prompt = custom_prompt
148
+
149
+ # Add handwritten specific instructions if needed
150
+ if handwritten_document and modified_custom_prompt:
151
+ if "handwritten" not in modified_custom_prompt.lower():
152
+ modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
153
+ elif handwritten_document and not modified_custom_prompt:
154
+ modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
155
+
156
+ # Add PDF-specific instructions if needed
157
+ if modified_custom_prompt and "pdf" not in modified_custom_prompt.lower() and "multi-page" not in modified_custom_prompt.lower():
158
+ modified_custom_prompt += " This is a multi-page PDF document."
159
+ elif not modified_custom_prompt:
160
+ modified_custom_prompt = "This is a multi-page PDF document."
161
+
162
+ # For certain filenames, explicitly add document type hints
163
+ filename_lower = uploaded_file.name.lower()
164
+ if "handwritten" in filename_lower or "letter" in filename_lower or "journal" in filename_lower:
165
+ if not modified_custom_prompt:
166
+ modified_custom_prompt = "This is a handwritten document in PDF format. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
167
+ elif "handwritten" not in modified_custom_prompt.lower():
168
+ modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text."
169
+
170
+ # Update the cache key with the modified prompt
171
+ if modified_custom_prompt != custom_prompt:
172
+ cache_key = generate_cache_key(
173
+ open(temp_path, 'rb').read(),
174
+ file_type,
175
+ use_vision,
176
+ preprocessing_options,
177
+ pdf_rotation,
178
+ modified_custom_prompt
179
+ )
180
+
181
+ result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options), modified_custom_prompt)
182
  progress_reporter.update(90, "Finalizing results...")
183
  except Exception as e:
184
  logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
 
187
  # If caching fails, process directly
188
  processor = StructuredOCR()
189
 
190
+
191
+ # Check if preprocessing options indicate a handwritten document
192
+ handwritten_document = preprocessing_options.get("document_type") == "handwritten"
193
+ modified_custom_prompt = custom_prompt
194
+
195
+ # Add handwritten specific instructions if needed
196
+ if handwritten_document and modified_custom_prompt:
197
+ if "handwritten" not in modified_custom_prompt.lower():
198
+ modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
199
+ elif handwritten_document and not modified_custom_prompt:
200
+ modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
201
+
202
+ # Add PDF-specific instructions if needed
203
+ if custom_prompt and "pdf" not in modified_custom_prompt.lower() and "multi-page" not in modified_custom_prompt.lower():
204
+ modified_custom_prompt += " This is a multi-page PDF document."
205
 
206
  # Process directly with optimized settings
207
  result = processor.process_file(
208
  file_path=temp_path,
209
  file_type="pdf",
210
  use_vision=use_vision,
211
+ custom_prompt=modified_custom_prompt,
212
  file_size_mb=file_size_mb,
213
  pdf_rotation=pdf_rotation
214
  )
 
242
  # Process the file using cached function if possible
243
  progress_reporter.update(50, "Processing document with OCR...")
244
  try:
245
+ # Check if preprocessing options indicate a handwritten document
246
+ handwritten_document = preprocessing_options.get("document_type") == "handwritten"
247
+ modified_custom_prompt = custom_prompt
248
+
249
+ # Add handwritten specific instructions if needed
250
+ if handwritten_document and modified_custom_prompt:
251
+ if "handwritten" not in modified_custom_prompt.lower():
252
+ modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
253
+ elif handwritten_document and not modified_custom_prompt:
254
+ modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
255
+
256
+ # For certain filenames, explicitly add document type hints
257
+ filename_lower = uploaded_file.name.lower()
258
+ if "handwritten" in filename_lower or "letter" in filename_lower or "journal" in filename_lower:
259
+ if not modified_custom_prompt:
260
+ modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
261
+ elif "handwritten" not in modified_custom_prompt.lower():
262
+ modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text."
263
+
264
+ # Update the cache key with the modified prompt
265
+ if modified_custom_prompt != custom_prompt:
266
+ cache_key = generate_cache_key(
267
+ open(temp_path, 'rb').read(),
268
+ file_type,
269
+ use_vision,
270
+ preprocessing_options,
271
+ 0,
272
+ modified_custom_prompt
273
+ )
274
+
275
+ result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options), modified_custom_prompt)
276
  progress_reporter.update(80, "Analyzing document structure...")
277
  progress_reporter.update(90, "Finalizing results...")
278
  except Exception as e:
 
287
  # Use simpler processing for speed
288
  pass # Any speed optimizations would be handled by the StructuredOCR class
289
 
290
+ # Check if preprocessing options indicate a handwritten document
291
+ handwritten_document = preprocessing_options.get("document_type") == "handwritten"
292
+ modified_custom_prompt = custom_prompt
293
+
294
+ # Add handwritten specific instructions if needed
295
+ if handwritten_document and modified_custom_prompt:
296
+ if "handwritten" not in modified_custom_prompt.lower():
297
+ modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
298
+ elif handwritten_document and not modified_custom_prompt:
299
+ modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
300
+
301
+ # For certain filenames, explicitly add document type hints
302
+ filename_lower = uploaded_file.name.lower()
303
+ if "handwritten" in filename_lower or "letter" in filename_lower or "journal" in filename_lower:
304
+ if not modified_custom_prompt:
305
+ modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
306
+ elif "handwritten" not in modified_custom_prompt.lower():
307
+ modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text."
308
+
309
  result = processor.process_file(
310
  file_path=temp_path,
311
  file_type=file_type,
312
  use_vision=use_vision,
313
+ custom_prompt=modified_custom_prompt,
314
  file_size_mb=file_size_mb
315
  )
316
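The ocr_processing.py changes above repeat the same prompt-augmentation logic in three places (cached path, PDF fallback, image fallback). A hedged sketch of that logic pulled into a single helper; the function name is hypothetical, while the instruction strings mirror the diff:

# Hypothetical helper consolidating the prompt hints added in this commit.
# Illustrative only; the committed code inlines this logic at each call site.
from typing import Optional

HANDWRITTEN_HINT = ("This is a handwritten document. Please carefully transcribe all "
                    "handwritten text, preserving line breaks and original formatting.")
PDF_HINT = "This is a multi-page PDF document."

def augment_prompt(custom_prompt: Optional[str],
                   document_type: str,
                   filename: str,
                   is_pdf: bool) -> str:
    prompt = custom_prompt or ""
    filename_lower = filename.lower()

    # Add handwriting instructions from the preprocessing option or filename hints.
    looks_handwritten = (document_type == "handwritten" or
                         any(k in filename_lower for k in ("handwritten", "letter", "journal")))
    if looks_handwritten and "handwritten" not in prompt.lower():
        prompt = (prompt + " " + HANDWRITTEN_HINT).strip()

    # Add a multi-page hint for PDFs.
    if is_pdf and "pdf" not in prompt.lower() and "multi-page" not in prompt.lower():
        prompt = (prompt + " " + PDF_HINT).strip()

    return prompt

# Example: a handwritten letter uploaded as a PDF with no user prompt.
print(augment_prompt(None, "handwritten", "family-letter.pdf", is_pdf=True))

Because the diff also adds custom_prompt as a parameter of the st.cache_data-wrapped process_file_cached, the prompt becomes part of the cache key, so results produced with different prompts no longer collide in the cache.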
 
ocr_utils.py CHANGED
@@ -3,12 +3,12 @@ Utility functions for OCR processing with Mistral AI.
3
  Contains helper functions for working with OCR responses and image handling.
4
  """
5
 
 
6
  import json
7
  import base64
8
  import io
9
  import zipfile
10
  import logging
11
- import numpy as np
12
  import time
13
  from datetime import datetime
14
  from pathlib import Path
@@ -16,20 +16,29 @@ from typing import Dict, List, Optional, Union, Any, Tuple
16
  from functools import lru_cache
17
 
18
  # Configure logging
19
- logger = logging.getLogger("ocr_utils")
20
 
 
21
  try:
22
  from PIL import Image, ImageEnhance, ImageFilter, ImageOps
23
- import cv2
24
  PILLOW_AVAILABLE = True
  CV2_AVAILABLE = True
26
- except ImportError as e:
27
- # Check which image libraries are available
28
- if "PIL" in str(e):
29
- PILLOW_AVAILABLE = False
30
- if "cv2" in str(e):
31
- CV2_AVAILABLE = False
32
 
 
33
  from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
34
  from mistralai.models import OCRImageObject
35
 
@@ -110,9 +119,36 @@ def encode_image_for_api(image_path: Union[str, Path]) -> str:
110
  if not image_file.is_file():
111
  raise FileNotFoundError(f"Image file not found: {image_file}")
112
 
113
  # Encode image as base64
114
  encoded = base64.b64encode(image_file.read_bytes()).decode()
115
- return f"data:image/jpeg;base64,{encoded}"
116
 
117
  def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
118
  """
@@ -509,7 +545,7 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
509
  aspect_ratio = width / height
510
 
511
  # Newspaper-style documents typically have width > height or are very large
512
- is_newspaper_format = (aspect_ratio > 1.2 and width > 2000) or (width > 3000 or height > 3000)
513
 
514
  if is_newspaper_format:
515
  logger.info(f"Newspaper format detected: {width}x{height}, applying specialized processing")
@@ -560,7 +596,7 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
560
  if is_document:
561
  # Newspapers typically have wide formats or very large dimensions
562
  aspect_ratio = width / height
563
- is_newspaper = (aspect_ratio > 1.2 and width > 2000) or (width > 3000 or height > 3000)
564
 
565
  logger.debug(f"Document type detection for {image_file.name}: " +
566
  f"{'newspaper' if is_newspaper else 'document' if is_document else 'photo'}")
@@ -712,6 +748,7 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
712
 
713
  # Get base64 with minimal memory footprint
714
  encoded_image = base64.b64encode(buffer.getvalue()).decode()
 
715
  base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
716
 
717
  # Update cache thread-safely
@@ -932,7 +969,7 @@ def _preprocess_document_image_impl() -> Image.Image:
932
 
933
  # Check for newspaper format first (takes precedence)
934
  aspect_ratio = width / height
935
- if (aspect_ratio > 1.2 and width > 2000) or (width > 3000 or height > 3000):
936
  is_newspaper = True
937
  logger.debug(f"Newspaper format detected: {width}x{height}, aspect ratio: {aspect_ratio:.2f}")
938
  else:
 
3
  Contains helper functions for working with OCR responses and image handling.
4
  """
5
 
6
+ # Standard library imports
7
  import json
8
  import base64
9
  import io
10
  import zipfile
11
  import logging
 
12
  import time
13
  from datetime import datetime
14
  from pathlib import Path
 
16
  from functools import lru_cache
17
 
18
  # Configure logging
19
+ logging.basicConfig(level=logging.INFO,
20
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # Third-party imports
24
+ import numpy as np
25
 
26
+ # Check for image processing libraries
27
  try:
28
  from PIL import Image, ImageEnhance, ImageFilter, ImageOps
 
29
  PILLOW_AVAILABLE = True
30
+ except ImportError:
31
+ logger.warning("PIL not available - image preprocessing will be limited")
32
+ PILLOW_AVAILABLE = False
33
+
34
+ try:
35
+ import cv2
36
  CV2_AVAILABLE = True
37
+ except ImportError:
38
+ logger.warning("OpenCV (cv2) not available - advanced image processing will be limited")
39
+ CV2_AVAILABLE = False
 
 
 
40
 
41
+ # Mistral AI imports
42
  from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
43
  from mistralai.models import OCRImageObject
44
 
 
119
  if not image_file.is_file():
120
  raise FileNotFoundError(f"Image file not found: {image_file}")
121
 
122
+ # Determine mime type based on file extension
123
+ mime_type = 'image/jpeg' # Default mime type
124
+ suffix = image_file.suffix.lower()
125
+ if suffix == '.png':
126
+ mime_type = 'image/png'
127
+ elif suffix == '.gif':
128
+ mime_type = 'image/gif'
129
+ elif suffix in ['.jpg', '.jpeg']:
130
+ mime_type = 'image/jpeg'
131
+ elif suffix == '.pdf':
132
+ mime_type = 'application/pdf'
133
+
134
  # Encode image as base64
135
  encoded = base64.b64encode(image_file.read_bytes()).decode()
136
+ return f"data:{mime_type};base64,{encoded}"
137
+
138
+ def encode_bytes_for_api(file_bytes: bytes, mime_type: str) -> str:
139
+ """
140
+ Encode binary data as base64 data URL for API submission.
141
+
142
+ Args:
143
+ file_bytes: Binary file data
144
+ mime_type: MIME type of the file (e.g., 'image/jpeg', 'application/pdf')
145
+
146
+ Returns:
147
+ Base64 data URL for the data
148
+ """
149
+ # Encode data as base64
150
+ encoded = base64.b64encode(file_bytes).decode()
151
+ return f"data:{mime_type};base64,{encoded}"
152
 
153
  def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
154
  """
 
545
  aspect_ratio = width / height
546
 
547
  # Newspaper-style documents typically have width > height or are very large
548
+ is_newspaper_format = (aspect_ratio > 1.15 and width > 2000) or (width > 3000 or height > 3000)
549
 
550
  if is_newspaper_format:
551
  logger.info(f"Newspaper format detected: {width}x{height}, applying specialized processing")
 
596
  if is_document:
597
  # Newspapers typically have wide formats or very large dimensions
598
  aspect_ratio = width / height
599
+ is_newspaper = (aspect_ratio > 1.15 and width > 2000) or (width > 3000 or height > 3000)
600
 
601
  logger.debug(f"Document type detection for {image_file.name}: " +
602
  f"{'newspaper' if is_newspaper else 'document' if is_document else 'photo'}")
 
748
 
749
  # Get base64 with minimal memory footprint
750
  encoded_image = base64.b64encode(buffer.getvalue()).decode()
751
+ # Always use image/jpeg MIME type since we explicitly save as JPEG above
752
  base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
753
 
754
  # Update cache thread-safely
 
969
 
970
  # Check for newspaper format first (takes precedence)
971
  aspect_ratio = width / height
972
+ if (aspect_ratio > 1.15 and width > 2000) or (width > 3000 or height > 3000):
973
  is_newspaper = True
974
  logger.debug(f"Newspaper format detected: {width}x{height}, aspect ratio: {aspect_ratio:.2f}")
975
  else:
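The ocr_utils.py changes above add MIME-type detection to encode_image_for_api and introduce encode_bytes_for_api, so PNGs and PDFs are no longer labelled image/jpeg. A small usage sketch of the same idea; the path is a placeholder, and the standard-library mimetypes module is used here only for illustration (the diff maps file suffixes by hand):

# Illustrative only: building a base64 data URL the way the new helpers do.
# The committed encode_image_for_api maps suffixes explicitly instead of using mimetypes.
import base64
import mimetypes
from pathlib import Path

def to_data_url(path: str) -> str:
    file_path = Path(path)
    mime_type, _ = mimetypes.guess_type(file_path.name)
    mime_type = mime_type or "image/jpeg"   # default mirrors the diff
    encoded = base64.b64encode(file_path.read_bytes()).decode()
    return f"data:{mime_type};base64,{encoded}"

if __name__ == "__main__":
    # A PNG now keeps its real MIME type instead of being sent as image/jpeg.
    url = to_data_url("input/milgram-flier.png")
    print(url[:40], "...")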
structured_ocr.py CHANGED
@@ -1,24 +1,31 @@
 
1
  import os
2
  import sys
3
  import time
4
  import random
5
- from enum import Enum
6
- from pathlib import Path
7
  import json
8
  import base64
9
  import logging
 
 
10
  from functools import lru_cache
11
  from typing import Optional, Dict, Any, List, Union, Tuple
12
13
  # Try to import pycountry, provide fallback if not available
14
  try:
15
  import pycountry
16
  PYCOUNTRY_AVAILABLE = True
17
  except ImportError:
18
  PYCOUNTRY_AVAILABLE = False
19
- logging.warning("pycountry module not available - using language code fallback")
20
-
21
- from pydantic import BaseModel
22
 
23
  # Try to import Mistral AI, provide fallback if not available
24
  try:
@@ -28,11 +35,7 @@ try:
28
  MISTRAL_AVAILABLE = True
29
  except ImportError:
30
  MISTRAL_AVAILABLE = False
31
- logging.warning("mistralai module not available - OCR functionality will be limited")
32
-
33
- # Configure logging
34
- logging.basicConfig(level=logging.INFO,
35
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
36
 
37
  # Import utilities for OCR processing
38
  try:
@@ -216,6 +219,12 @@ class StructuredOCR:
216
  if file_type is None:
217
  suffix = file_path.suffix.lower()
218
  file_type = "pdf" if suffix == ".pdf" else "image"
219
 
220
  # Get file size if not provided
221
  if file_size_mb is None and file_path.exists():
@@ -437,6 +446,7 @@ class StructuredOCR:
437
  # Convert only the selected pages to minimize memory usage
438
  selected_images = []
439
  combined_text = []
 
440
 
441
  # Process pages in larger batches for better efficiency
442
  batch_size = 5 # Process 5 pages at a time for better throughput
@@ -472,6 +482,11 @@ class StructuredOCR:
472
  # Add page text to combined text without obvious page markers
473
  page_text = page_result['ocr_contents']['raw_text']
474
  combined_text.append(f"{page_text}")
475
  except Exception as page_e:
476
  logger.warning(f"Error processing page {page_num}: {str(page_e)}")
477
  # Clean up temp file
@@ -509,28 +524,7 @@ class StructuredOCR:
509
  # Add flag to indicate custom prompt was applied
510
  result['custom_prompt_applied'] = 'text_only'
511
 
512
- # Detect document type from custom prompt if available
513
- if custom_prompt:
514
- # Extract document type if specified
515
- doc_type = "general"
516
- if "DOCUMENT TYPE:" in custom_prompt:
517
- doc_type_line = custom_prompt.split("\n")[0]
518
- if "DOCUMENT TYPE:" in doc_type_line:
519
- doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip().lower()
520
- # Keyword-based detection as fallback
521
- elif any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
522
- doc_type = "newspaper"
523
- elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
524
- doc_type = "letter"
525
- elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
526
- doc_type = "book"
527
- elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
528
- doc_type = "form"
529
- elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
530
- doc_type = "recipe"
531
-
532
- # Store detected document type in result
533
- result['detected_document_type'] = doc_type
534
 
535
  except Exception as e:
536
  logger.warning(f"Custom prompt processing failed: {str(e)}. Using standard processing.")
@@ -544,6 +538,10 @@ class StructuredOCR:
544
  if 'ocr_contents' in result:
545
  result['ocr_contents']['raw_text'] = all_text
546
 
 
 
 
 
547
  # Add PDF metadata
548
  result['file_name'] = file_path.name
549
  result['pdf_processing_method'] = 'pdf2image_optimized'
@@ -711,6 +709,24 @@ class StructuredOCR:
711
  limited_pages = True
712
  logger.info(f"Processing {len(pages_to_process)} pages out of {total_pages} total")
713
 
 
714
  # Calculate confidence score if available
715
  try:
716
  confidence_values = [page.confidence for page in pages_to_process if hasattr(page, 'confidence')]
@@ -733,6 +749,12 @@ class StructuredOCR:
733
  if page_markdown.strip():
734
  all_markdown.append(f"{page_markdown}")
735
 
 
 
736
  # Join all pages with separation
737
  combined_markdown = "\n\n".join(all_markdown)
738
 
@@ -766,6 +788,13 @@ class StructuredOCR:
766
  combined_markdown, file_path.name, custom_prompt
767
  )
768
 
 
769
  # Add metadata about pages
770
  if limited_pages:
771
  result['limited_pages'] = {
@@ -927,24 +956,44 @@ class StructuredOCR:
927
  "confidence_score": 0.0
928
  }
929
 
930
- # Check if this is likely a newspaper or document with columns by filename
931
  is_likely_newspaper = False
 
 
932
  newspaper_keywords = ["newspaper", "gazette", "herald", "times", "journal",
933
  "chronicle", "post", "tribune", "news", "press", "gender"]
 
 
934
 
935
- # Check filename for newspaper indicators
936
  filename_lower = file_path.name.lower()
937
- for keyword in newspaper_keywords:
 
 
938
  if keyword in filename_lower:
939
- is_likely_newspaper = True
940
- logger.info(f"Likely newspaper document detected from filename: {file_path.name}")
941
- # Add newspaper-specific processing hint to custom_prompt if not already present
942
  if custom_prompt:
943
- if "column" not in custom_prompt.lower() and "newspaper" not in custom_prompt.lower():
944
- custom_prompt = custom_prompt + " This appears to be a newspaper or document with columns. Please extract all text content from each column."
945
  else:
946
- custom_prompt = "This appears to be a newspaper or document with columns. Please extract all text content from each column, maintaining proper reading order."
947
  break
 
 
948
 
949
  try:
950
  # Check file size
@@ -1017,21 +1066,24 @@ class StructuredOCR:
1017
  logger.info(f"Resized image to {new_size_mb:.2f} MB")
1018
  except ImportError:
1019
  logger.warning("PIL not available for resizing. Using original image.")
1020
- encoded_image = base64.b64encode(file_path.read_bytes()).decode()
1021
- base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
 
1022
  except Exception as e:
1023
  logger.warning(f"Image resize failed: {str(e)}. Using original image.")
1024
- encoded_image = base64.b64encode(file_path.read_bytes()).decode()
1025
- base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
 
1026
  else:
1027
- # For smaller images, use as-is
1028
- encoded_image = base64.b64encode(file_path.read_bytes()).decode()
1029
- base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
1030
  except Exception as e:
1031
  # Fallback to original image if any preprocessing fails
1032
  logger.warning(f"Image preprocessing failed: {str(e)}. Using original image.")
1033
- encoded_image = base64.b64encode(file_path.read_bytes()).decode()
1034
- base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
 
1035
 
1036
  # Process the image with OCR
1037
  logger.info(f"Processing image with OCR using {OCR_MODEL}")
@@ -1123,10 +1175,40 @@ class StructuredOCR:
1123
  # Get the OCR markdown from the first page
1124
  image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""
1125
 
 
 
1126
  # Optimize: Skip vision model step if ocr_markdown is very small or empty
1127
  # BUT make an exception for newspapers or if custom_prompt is provided
1128
- if (not is_likely_newspaper and not custom_prompt) and (not image_ocr_markdown or len(image_ocr_markdown) < 50):
1129
- logger.warning("OCR produced minimal or no text. Returning basic result.")
 
1130
  return {
1131
  "file_name": file_path.name,
1132
  "topics": ["Document"],
@@ -1134,7 +1216,9 @@ class StructuredOCR:
1134
  "ocr_contents": {
1135
  "raw_text": image_ocr_markdown if image_ocr_markdown else "No text could be extracted from the image."
1136
  },
1137
- "processing_note": "OCR produced minimal text content"
 
 
1138
  }
1139
 
1140
  # For newspapers with little text in OCR, set a more explicit prompt
@@ -1144,6 +1228,14 @@ class StructuredOCR:
1144
  custom_prompt = "This is a newspaper or document with columns. The OCR may not have captured all text. Please examine the image carefully and extract ALL text content visible in the document, reading each column from top to bottom."
1145
  elif "extract all text" not in custom_prompt.lower():
1146
  custom_prompt += " Please examine the image carefully and extract ALL text content visible in the document."
 
 
1147
 
1148
  # Extract structured data using the appropriate model, with a single API call
1149
  if use_vision:
@@ -1153,6 +1245,13 @@ class StructuredOCR:
1153
  logger.info(f"Using text-only model: {TEXT_MODEL}")
1154
  result = self._extract_structured_data_text_only(image_ocr_markdown, file_path.name, custom_prompt)
1155
 
 
1156
  # Store the serialized OCR response for image rendering (for compatibility with original version)
1157
  # Don't store raw_response directly as it's not JSON serializable
1158
  serialized_response = serialize_ocr_response(image_response)
@@ -1160,7 +1259,6 @@ class StructuredOCR:
1160
 
1161
  # Store key parts of the OCR response for image rendering
1162
  # With serialized format that can be stored in JSON
1163
- has_images = hasattr(image_response, 'pages') and image_response.pages and hasattr(image_response.pages[0], 'images') and image_response.pages[0].images
1164
  result['has_images'] = has_images
1165
 
1166
  if has_images:
@@ -1273,10 +1371,6 @@ class StructuredOCR:
1273
  logger.info("Test mode or no API key, using text-only processing")
1274
  return self._extract_structured_data_text_only(ocr_markdown, filename)
1275
 
1276
- # Detect document type with optimized cached implementation
1277
- doc_type = self._detect_document_type(custom_prompt, ocr_markdown)
1278
- logger.info(f"Detected document type: {doc_type}")
1279
-
1280
  # Use only the first part of OCR text to keep prompts small and processing fast
1281
  if len(ocr_markdown) > 1000:
1282
  truncated_ocr = ocr_markdown[:1000]
@@ -1284,8 +1378,26 @@ class StructuredOCR:
1284
  else:
1285
  truncated_ocr = ocr_markdown
1286
 
1287
- # Build an optimized prompt based on document type
1288
- enhanced_prompt = self._build_enhanced_prompt(doc_type, truncated_ocr, custom_prompt)
 
 
 
1289
 
1290
  # Measure API call time for optimization feedback
1291
  start_time = time.time()
@@ -1294,7 +1406,7 @@ class StructuredOCR:
1294
  # Use a fixed, shorter timeout for single-page documents
1295
  timeout_ms = 45000 # 45 seconds is optimal for most single-page documents
1296
 
1297
- logger.info(f"Calling vision model with {timeout_ms}ms timeout and document type {doc_type}")
1298
  chat_response = self.client.chat.parse(
1299
  model=VISION_MODEL,
1300
  messages=[
@@ -1312,7 +1424,7 @@ class StructuredOCR:
1312
  )
1313
 
1314
  api_time = time.time() - start_time
1315
- logger.info(f"Vision model completed in {api_time:.2f}s with document type: {doc_type}")
1316
 
1317
  except Exception as e:
1318
  # If there's an error with the enhanced prompt, try progressively simpler approaches
@@ -1392,42 +1504,16 @@ class StructuredOCR:
1392
  if 'languages' in result:
1393
  result['languages'] = [str(lang) for lang in result.get('languages', [])]
1394
 
1395
- # Add metadata about processing
1396
  result['processing_info'] = {
1397
  'method': 'vision_model',
1398
- 'document_type': doc_type,
1399
  'ocr_text_length': len(ocr_markdown),
1400
  'api_response_time': time.time() - start_time
1401
  }
1402
 
1403
- # Flag when custom prompt has been successfully applied
1404
  if custom_prompt:
1405
  result['custom_prompt_applied'] = 'vision_model'
1406
-
1407
- # Attempt to detect document type from custom prompt
1408
- if "DOCUMENT TYPE:" in custom_prompt:
1409
- doc_type_line = custom_prompt.split("\n")[0]
1410
- if "DOCUMENT TYPE:" in doc_type_line:
1411
- custom_doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip().lower()
1412
- result['detected_document_type'] = custom_doc_type
1413
- # Keyword-based detection as fallback
1414
- elif any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
1415
- result['detected_document_type'] = "newspaper"
1416
- elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
1417
- result['detected_document_type'] = "letter"
1418
- elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
1419
- result['detected_document_type'] = "book"
1420
- elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
1421
- result['detected_document_type'] = "form"
1422
- elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
1423
- result['detected_document_type'] = "recipe"
1424
- elif "this is a" in custom_prompt.lower():
1425
- # Extract document type from "This is a [type]" format
1426
- this_is_parts = custom_prompt.lower().split("this is a ")
1427
- if len(this_is_parts) > 1:
1428
- extracted_type = this_is_parts[1].split(".")[0].strip()
1429
- if extracted_type:
1430
- result['detected_document_type'] = extracted_type
1431
 
1432
  # Add confidence score if not present
1433
  if 'confidence_score' not in result:
@@ -1440,268 +1526,38 @@ class StructuredOCR:
1440
 
1441
  return result
1442
 
1443
- # Thread-safe document type detection cache with increased size for better performance
1444
- _doc_type_cache = {}
1445
- _doc_type_cache_size = 256
1446
-
1447
- @staticmethod
1448
- def _detect_document_type_cached(custom_prompt: Optional[str], ocr_text_sample: str) -> str:
1449
- """
1450
- Cached version of document type detection logic with thread-safe implementation
1451
- """
1452
- # Generate cache key - use first 50 chars of prompt and ocr_text to avoid memory issues
1453
- prompt_key = str(custom_prompt)[:50] if custom_prompt else ""
1454
- text_key = ocr_text_sample[:50] if ocr_text_sample else ""
1455
- cache_key = f"{prompt_key}::{text_key}"
1456
-
1457
- # Check cache first (fast path)
1458
- if cache_key in StructuredOCR._doc_type_cache:
1459
- return StructuredOCR._doc_type_cache[cache_key]
1460
-
1461
- # Set default document type
1462
- doc_type = "general"
1463
-
1464
- # Optimized pattern matching with compiled lookup dictionaries
1465
- doc_type_patterns = {
1466
- "handwritten": ["handwritten", "handwriting", "cursive", "manuscript"],
1467
- "letter": ["letter", "correspondence", "message", "dear sir", "dear madam", "sincerely", "yours truly"],
1468
- "legal": ["form", "contract", "agreement", "legal", "certificate", "court", "attorney", "plaintiff", "defendant"],
1469
- "recipe": ["recipe", "food", "ingredients", "directions", "tbsp", "tsp", "cup", "mix", "bake", "cooking"],
1470
- "travel": ["travel", "expedition", "journey", "exploration", "voyage", "destination", "map"],
1471
- "scientific": ["scientific", "experiment", "hypothesis", "research", "study", "analysis", "results", "procedure"],
1472
- "newspaper": ["news", "newspaper", "article", "press", "headline", "column", "editor"]
1473
- }
1474
-
1475
- # Fast custom prompt matching
1476
- if custom_prompt:
1477
- prompt_lower = custom_prompt.lower()
1478
-
1479
- # Optimized pattern matching with early exit
1480
- for detected_type, patterns in doc_type_patterns.items():
1481
- if any(term in prompt_lower for term in patterns):
1482
- doc_type = detected_type
1483
- break
1484
-
1485
- # Fast OCR text matching if still general type
1486
- if doc_type == "general" and ocr_text_sample:
1487
- ocr_lower = ocr_text_sample.lower()
1488
-
1489
- # Use the same patterns dictionary for consistency, but scan the OCR text
1490
- for detected_type, patterns in doc_type_patterns.items():
1491
- if any(term in ocr_lower for term in patterns):
1492
- doc_type = detected_type
1493
- break
1494
-
1495
- # Cache the result with improved LRU-like behavior
1496
- if len(StructuredOCR._doc_type_cache) >= StructuredOCR._doc_type_cache_size:
1497
- # Clear multiple entries at once for better performance
1498
- try:
1499
- # Remove up to 20 entries to avoid frequent cache clearing
1500
- for _ in range(20):
1501
- if StructuredOCR._doc_type_cache:
1502
- StructuredOCR._doc_type_cache.pop(next(iter(StructuredOCR._doc_type_cache)))
1503
- except:
1504
- # If concurrent modification causes issues, just proceed
1505
- pass
1506
-
1507
- # Store in cache
1508
- StructuredOCR._doc_type_cache[cache_key] = doc_type
1509
-
1510
- return doc_type
1511
-
1512
- def _detect_document_type(self, custom_prompt: Optional[str], ocr_text: str) -> str:
1513
- """
1514
- Detect document type based on content and custom prompt.
1515
-
1516
- Args:
1517
- custom_prompt: User-provided custom prompt
1518
- ocr_text: OCR-extracted text
1519
-
1520
- Returns:
1521
- Document type identifier ("handwritten", "printed", "letter", etc.)
1522
- """
1523
- # Only sample first 1000 characters of OCR text for faster processing while maintaining accuracy
1524
- ocr_sample = ocr_text[:1000] if ocr_text else ""
1525
-
1526
- # Use the cached version for better performance
1527
- return self._detect_document_type_cached(custom_prompt, ocr_sample)
1528
-
1529
- def _build_enhanced_prompt(self, doc_type: str, ocr_text: str, custom_prompt: Optional[str]) -> str:
1530
- """
1531
- Build an optimized prompt focused on OCR accuracy with specialized attention to
1532
- historical typography, manuscript conventions, and document deterioration patterns.
1533
 
1534
- Args:
1535
- doc_type: Detected document type
1536
- ocr_text: OCR-extracted text
1537
- custom_prompt: User-provided custom prompt
1538
-
1539
- Returns:
1540
- Optimized prompt focused on text extraction with historical document expertise
1541
- """
1542
- # Generic document section (included in all prompts)
1543
  generic_section = (
1544
- f"This is a document's OCR text:\n"
1545
- f"<BEGIN_OCR>\n{ocr_text}\n<END_OCR>\n\n"
 
 
1546
  )
1547
 
1548
- # Check if custom prompt contains document type information
1549
- has_custom_doc_type = False
1550
- custom_doc_type = ""
1551
-
1552
- if custom_prompt and "DOCUMENT TYPE:" in custom_prompt:
1553
- # Extract the document type from the custom prompt
1554
- doc_type_line = custom_prompt.split("\n")[0]
1555
- if "DOCUMENT TYPE:" in doc_type_line:
1556
- custom_doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip()
1557
- has_custom_doc_type = True
1558
- # If we have a custom doc type, use it instead
1559
- if custom_doc_type:
1560
- doc_type = custom_doc_type.lower()
1561
-
1562
- # If user has provided detailed instructions, provide more elaborate prompting
1563
- if custom_prompt and (has_custom_doc_type or len(custom_prompt.strip()) > 20):
1564
- # Enhanced prompt for documents with custom instructions and historical expertise
1565
- specific_section = (
1566
- f"You are an advanced OCR specialist with expertise in historical documents, typography, and manuscript conventions. "
1567
- f"Below is a document that requires specialized analysis with attention to historical characteristics. "
1568
- f"Pay particular attention to:\n"
1569
- f"- Historical typography features (long s 'ſ', ligatures, obsolete letter forms)\n"
1570
- f"- Manuscript conventions of the period (abbreviations, contractions, marginalia)\n"
1571
- f"- Document deterioration patterns (faded ink, foxing, water damage, paper degradation)\n"
1572
- f"- Accurately capturing ALL text content visible in the image with historical context\n"
1573
- f"- Following the specific user instructions for processing this document type\n"
1574
- f"- Identifying key information, structure, and historical formatting conventions\n"
1575
- f"- Providing comprehensive analysis with attention to historical context\n"
1576
- )
1577
-
1578
- # Add specialized instructions based on document type
1579
- if doc_type == "newspaper":
1580
- specific_section += (
1581
- f"\nThis appears to be a newspaper or document with columns. "
1582
- f"Please read each column from top to bottom, then move to the next column. "
1583
- f"Extract all article titles, headings, bylines, and body text in the correct reading order. "
1584
- f"Pay special attention to section headers, page numbers, publication date, and newspaper name. "
1585
- f"For historical newspapers, be aware of period-specific typography such as the long s (ſ), "
1586
- f"unique ligatures (æ, œ, ct, st), and decorative fonts. Account for paper degradation around "
1587
- f"fold lines and edges. Recognize archaic abbreviations and typesetting conventions of the period.\n"
1588
- )
1589
- elif doc_type == "letter":
1590
- specific_section += (
1591
- f"\nThis appears to be a letter or correspondence. "
1592
- f"Pay special attention to the letterhead, date, greeting, body content, closing, and signature. "
1593
- f"Preserve the original formatting including paragraph breaks and indentation. "
1594
- f"Note any handwritten annotations or marginalia separately. "
1595
- f"For historical letters, carefully transcribe historical scripts and handwriting styles, "
1596
- f"noting unclear or damaged sections. Identify period-specific salutations, closings, and "
1597
- f"formalities. Watch for ink fading, bleeding, and seepage through pages. "
1598
- f"Recognize period-specific abbreviations (ye, yr, inst, ult, prox) and long s (ſ) in older printed correspondence.\n"
1599
- )
1600
- elif doc_type == "book":
1601
- specific_section += (
1602
- f"\nThis appears to be a book or publication page. "
1603
- f"Pay attention to chapter titles, headers, page numbers, footnotes, and main body text. "
1604
- f"Preserve paragraph structure and any special formatting. "
1605
- f"Note any images, tables, or figures that might be referenced in the text. "
1606
- f"For historical books, attend to period typography including the long s (ſ), ligatures (æ, œ, ct, ſt), "
1607
- f"archaic letter forms, and decorative initials/drop caps. Account for foxing (brown spotting), "
1608
- f"bleed-through from opposite pages, and binding damage. Recognize period-specific typographic "
1609
- f"conventions like catchwords, signatures, obsolete punctuation, and historical spelling variants "
1610
- f"(e.g., -ize/-ise, past tense 'd for -ed). Note bookplates, ownership marks, and marginalia.\n"
1611
- )
1612
- elif doc_type == "form":
1613
- specific_section += (
1614
- f"\nThis appears to be a form or legal document. "
1615
- f"Carefully extract all field labels and their corresponding values. "
1616
- f"Preserve the structure of form fields and sections. "
1617
- f"Pay special attention to signature lines, dates, and any official markings. "
1618
- f"For historical forms and legal documents, recognize period-specific legal terminology and "
1619
- f"formulaic phrases. Note seals, stamps, watermarks, and official emblems. Watch for faded ink "
1620
- f"in signatures and filled fields. Identify period handwriting styles in completed sections. "
1621
- f"Account for specialized legal abbreviations (e.g., SS., Esq., inst., wit.) and archaic "
1622
- f"measurement units. Note folding patterns and worn edges common in frequently handled legal documents.\n"
1623
- )
1624
- elif doc_type == "recipe":
1625
- specific_section += (
1626
- f"\nThis appears to be a recipe or food-related document. "
1627
- f"Extract the recipe title, ingredient list (with measurements), preparation steps, "
1628
- f"cooking times, serving information, and any notes or tips. "
1629
- f"Maintain the distinction between ingredients and preparation instructions. "
1630
- f"For historical recipes, attend to archaic measurements (gill, dram, peck, firkin), obsolete "
1631
- f"cooking terminology, and period-specific ingredients and their modern equivalents. Note handwritten "
1632
- f"annotations and personal modifications. Identify period-specific cooking methods and tools that "
1633
- f"might need explanation. Watch for liquid stains and food residue common on well-used recipe pages. "
1634
- f"Recognize unclear fractions and temperature instructions (e.g., 'slow oven', 'quick fire').\n"
1635
- )
1636
-
1637
- # Output instructions (enhanced for custom requests)
1638
- output_section = (
1639
- f"Create a detailed structured JSON response with the following fields:\n"
1640
- f"- file_name: The document's name\n"
1641
- f"- topics: An array of specific topics, themes, or subjects covered in the document\n"
1642
- f"- languages: An array of languages used in the document\n"
1643
- f"- ocr_contents: A comprehensive dictionary with the document's contents including:\n"
1644
- f" * title: The main title or heading\n"
1645
- f" * subtitle: Any subtitle or secondary heading (if present)\n"
1646
- f" * date: Publication or document date (if present)\n"
1647
- f" * author: Author or creator information (if present)\n"
1648
- f" * content: The main body content, properly formatted\n"
1649
- f" * additional sections as appropriate for this document type\n"
1650
- f" * raw_text: The complete OCR text\n"
1651
- )
1652
- else:
1653
- # Default processing with basic historical document awareness
1654
- specific_section = (
1655
- f"You are an OCR specialist with knowledge of historical documents and typography. "
1656
- f"Focus on accurately extracting text content with attention to historical features. "
1657
- f"Pay special attention to:\n"
1658
- f"- Accurately capturing ALL text content visible in the image\n"
1659
- f"- Maintaining the correct reading order and structure\n"
1660
- f"- Preserving paragraph breaks and text layout\n"
1661
- f"- Identifying the main document type, time period, and language\n"
1662
- f"- Recognizing historical typography features (long s 'ſ', ligatures, archaic characters)\n"
1663
- f"- Accounting for document deterioration (faded ink, stains, foxing, physical damage)\n"
1664
- )
1665
-
1666
- # Only add specialized instructions for newspapers with columns
1667
- if doc_type == "newspaper":
1668
- specific_section += (
1669
- f"\nThis appears to be a document with columns. "
1670
- f"Be sure to read each column from top to bottom, then move to the next column. "
1671
- f"Extract all article titles, headings, and body text.\n"
1672
- )
1673
-
1674
- # Simple output instructions for default cases
1675
- output_section = (
1676
- f"Create a structured JSON response with the following fields:\n"
1677
- f"- file_name: The document's name\n"
1678
- f"- topics: An array of topics covered in the document\n"
1679
- f"- languages: An array of languages used in the document\n"
1680
- f"- ocr_contents: A dictionary with the document's contents, with the focus on complete text extraction\n"
1681
- )
1682
-
1683
  # Add custom prompt if provided
1684
  custom_section = ""
1685
  if custom_prompt:
1686
- # Process custom prompt to extract just the instructions part if available
1687
- if "USER INSTRUCTIONS:" in custom_prompt:
1688
- instructions_part = custom_prompt.split("USER INSTRUCTIONS:")[1].strip()
1689
- custom_section = f"\n\nUser-provided instructions: {instructions_part}\n"
1690
- elif "INSTRUCTIONS:" in custom_prompt:
1691
- instructions_part = custom_prompt.split("INSTRUCTIONS:")[1].strip()
1692
- custom_section = f"\n\nUser-provided instructions: {instructions_part}\n"
1693
- else:
1694
- # Strip custom prompt to essentials
1695
- stripped_prompt = custom_prompt.replace("This is a", "").replace("It appears to be a", "")
1696
- custom_section = f"\n\nUser-provided instructions: {stripped_prompt}\n"
1697
 
1698
- # Combine all sections into complete prompt
1699
- return generic_section + specific_section + output_section + custom_section
1700
 
1701
  def _extract_structured_data_text_only(self, ocr_markdown, filename, custom_prompt=None):
1702
  """
1703
  Extract structured data using text-only model with detailed historical context prompting
1704
- and improved error handling
1705
  """
1706
  logger = logging.getLogger("text_processor")
1707
  start_time = time.time()
@@ -1710,10 +1566,68 @@ class StructuredOCR:
1710
  # Fast path: Skip for minimal OCR text
1711
  if not ocr_markdown or len(ocr_markdown.strip()) < 50:
1712
  logger.info("Minimal OCR text - returning basic result")
 
 
1713
  return {
1714
  "file_name": filename,
1715
  "topics": ["Document"],
1716
- "languages": ["English"],
1717
  "ocr_contents": {
1718
  "raw_text": ocr_markdown if ocr_markdown else "No text could be extracted"
1719
  },
@@ -1734,10 +1648,6 @@ class StructuredOCR:
1734
  "processing_method": "test_mode"
1735
  }
1736
 
1737
- # Detect document type and build enhanced prompt
1738
- doc_type = self._detect_document_type(custom_prompt, ocr_markdown)
1739
- logger.info(f"Detected document type: {doc_type}")
1740
-
1741
  # If OCR text is very large, truncate it to avoid API limits
1742
  truncated_text = ocr_markdown
1743
  if len(ocr_markdown) > 25000:
@@ -1745,8 +1655,25 @@ class StructuredOCR:
1745
  truncated_text = ocr_markdown[:15000] + "\n...[content truncated]...\n" + ocr_markdown[-5000:]
1746
  logger.info(f"OCR text truncated from {len(ocr_markdown)} to {len(truncated_text)} chars")
1747
 
1748
- # Build the prompt with truncated text if needed
1749
- enhanced_prompt = self._build_enhanced_prompt(doc_type, truncated_text, custom_prompt)
 
 
1750
 
1751
  # Use enhanced prompt with text-only model - with retry logic
1752
  max_retries = 2
@@ -1784,40 +1711,14 @@ class StructuredOCR:
1784
  if 'languages' in result:
1785
  result['languages'] = [str(lang) for lang in result.get('languages', [])]
1786
 
1787
- # Add processing metadata
1788
  result['processing_method'] = 'text_model'
1789
- result['document_type'] = doc_type
1790
  result['model_used'] = TEXT_MODEL
1791
  result['processing_time'] = time.time() - start_time
1792
 
1793
  # Flag when custom prompt has been successfully applied
1794
  if custom_prompt:
1795
  result['custom_prompt_applied'] = 'text_model'
1796
-
1797
- # Attempt to detect document type from custom prompt
1798
- if "DOCUMENT TYPE:" in custom_prompt:
1799
- doc_type_line = custom_prompt.split("\n")[0]
1800
- if "DOCUMENT TYPE:" in doc_type_line:
1801
- custom_doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip().lower()
1802
- result['detected_document_type'] = custom_doc_type
1803
- # Keyword-based detection as fallback
1804
- elif any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
1805
- result['detected_document_type'] = "newspaper"
1806
- elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
1807
- result['detected_document_type'] = "letter"
1808
- elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
1809
- result['detected_document_type'] = "book"
1810
- elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
1811
- result['detected_document_type'] = "form"
1812
- elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
1813
- result['detected_document_type'] = "recipe"
1814
- elif "this is a" in custom_prompt.lower():
1815
- # Extract document type from "This is a [type]" format
1816
- this_is_parts = custom_prompt.lower().split("this is a ")
1817
- if len(this_is_parts) > 1:
1818
- extracted_type = this_is_parts[1].split(".")[0].strip()
1819
- if extracted_type:
1820
- result['detected_document_type'] = extracted_type
1821
 
1822
  # Add raw text for reference if not already present
1823
  if 'ocr_contents' in result and 'raw_text' not in result['ocr_contents']:
@@ -1880,18 +1781,7 @@ class StructuredOCR:
1880
  "processing_time": time.time() - start_time
1881
  }
1882
 
1883
- # Try to extract some basic metadata even without AI
1884
- if ocr_markdown:
1885
- # Simple content analysis
1886
- text_sample = ocr_markdown[:5000].lower()
1887
-
1888
- # Try to detect language
1889
- if "dear" in text_sample and any(word in text_sample for word in ["sincerely", "regards", "truly"]):
1890
- result["topics"].append("Letter")
1891
- elif any(word in text_sample for word in ["recipe", "ingredients", "instructions", "cook", "bake"]):
1892
- result["topics"].append("Recipe")
1893
- elif any(word in text_sample for word in ["article", "report", "study", "analysis"]):
1894
- result["topics"].append("Article")
1895
 
1896
  except Exception as inner_e:
1897
  logger.error(f"Error creating basic result: {str(inner_e)}")
@@ -1919,4 +1809,4 @@ if __name__ == "__main__":
1919
  processor = StructuredOCR()
1920
  result = processor.process_file(file_path)
1921
 
1922
- print(json.dumps(result, indent=2))
 
1
+ # Standard library imports
2
  import os
3
  import sys
4
  import time
5
  import random
 
 
6
  import json
7
  import base64
8
  import logging
9
+ from enum import Enum
10
+ from pathlib import Path
11
  from functools import lru_cache
12
  from typing import Optional, Dict, Any, List, Union, Tuple
13
 
14
+ # Configure logging
15
+ logging.basicConfig(level=logging.INFO,
16
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Third-party imports
20
+ from pydantic import BaseModel
21
+
22
  # Try to import pycountry, provide fallback if not available
23
  try:
24
  import pycountry
25
  PYCOUNTRY_AVAILABLE = True
26
  except ImportError:
27
  PYCOUNTRY_AVAILABLE = False
28
+ logger.warning("pycountry module not available - using language code fallback")
 
 
29
 
30
  # Try to import Mistral AI, provide fallback if not available
31
  try:
 
35
  MISTRAL_AVAILABLE = True
36
  except ImportError:
37
  MISTRAL_AVAILABLE = False
38
+ logger.warning("mistralai module not available - OCR functionality will be limited")
 
 
39
 
40
  # Import utilities for OCR processing
41
  try:
 
219
  if file_type is None:
220
  suffix = file_path.suffix.lower()
221
  file_type = "pdf" if suffix == ".pdf" else "image"
222
+
223
+ # Check for handwritten document by filename
224
+ filename_lower = file_path.name.lower()
225
+ if "handwritten" in filename_lower or "manuscript" in filename_lower or "letter" in filename_lower:
226
+ logger.info(f"Detected likely handwritten document from filename: {file_path.name}")
227
+ # This will be used during processing to apply handwritten-specific handling
228
 
229
  # Get file size if not provided
230
  if file_size_mb is None and file_path.exists():
 
446
  # Convert only the selected pages to minimize memory usage
447
  selected_images = []
448
  combined_text = []
449
+ detected_languages = set() # Track detected languages across all pages
450
 
451
  # Process pages in larger batches for better efficiency
452
  batch_size = 5 # Process 5 pages at a time for better throughput
 
482
  # Add page text to combined text without obvious page markers
483
  page_text = page_result['ocr_contents']['raw_text']
484
  combined_text.append(f"{page_text}")
485
+
486
+ # Collect detected languages from each page
487
+ if 'languages' in page_result:
488
+ for lang in page_result['languages']:
489
+ detected_languages.add(lang)
490
  except Exception as page_e:
491
  logger.warning(f"Error processing page {page_num}: {str(page_e)}")
492
  # Clean up temp file
 
524
  # Add flag to indicate custom prompt was applied
525
  result['custom_prompt_applied'] = 'text_only'
526
 
527
+ # Simplified approach - no document type detection
 
 
528
 
529
  except Exception as e:
530
  logger.warning(f"Custom prompt processing failed: {str(e)}. Using standard processing.")
 
538
  if 'ocr_contents' in result:
539
  result['ocr_contents']['raw_text'] = all_text
540
 
541
+ # Merge detected languages if available
542
+ if detected_languages:
543
+ result['languages'] = list(detected_languages)
544
+
545
  # Add PDF metadata
546
  result['file_name'] = file_path.name
547
  result['pdf_processing_method'] = 'pdf2image_optimized'
 
709
  limited_pages = True
710
  logger.info(f"Processing {len(pages_to_process)} pages out of {total_pages} total")
711
 
712
+ # Directly extract any language information from the OCR response
713
+ detected_languages = set()
714
+
715
+ # Check if the response has a 'languages' attribute in any form
716
+ # First check direct attributes on the response object
717
+ if hasattr(pdf_response, 'languages') and pdf_response.languages:
718
+ for lang in pdf_response.languages:
719
+ detected_languages.add(str(lang))
720
+ logger.info(f"Found language in OCR response: {lang}")
721
+
722
+ # Then check if it's in the response as a dictionary format
723
+ elif hasattr(pdf_response, '__dict__'):
724
+ response_dict = pdf_response.__dict__
725
+ if 'languages' in response_dict and response_dict['languages']:
726
+ for lang in response_dict['languages']:
727
+ detected_languages.add(str(lang))
728
+ logger.info(f"Found language in OCR response dict: {lang}")
729
+
730
  # Calculate confidence score if available
731
  try:
732
  confidence_values = [page.confidence for page in pages_to_process if hasattr(page, 'confidence')]
 
749
  if page_markdown.strip():
750
  all_markdown.append(f"{page_markdown}")
751
 
752
+ # Collect language information from individual pages if available
753
+ if hasattr(page, 'languages') and page.languages:
754
+ for lang in page.languages:
755
+ detected_languages.add(str(lang))
756
+ logger.info(f"Found language in page {page_num}: {lang}")
757
+
758
  # Join all pages with separation
759
  combined_markdown = "\n\n".join(all_markdown)
760
 
 
788
  combined_markdown, file_path.name, custom_prompt
789
  )
790
 
791
+ # If we have detected languages directly from the OCR model, use them
792
+ if detected_languages:
793
+ logger.info(f"Using languages detected by OCR model: {', '.join(detected_languages)}")
794
+ result['languages'] = list(detected_languages)
795
+ # Add flag to indicate source of language detection
796
+ result['language_detection_source'] = 'mistral-ocr-latest'
797
+
798
  # Add metadata about pages
799
  if limited_pages:
800
  result['limited_pages'] = {
 
956
  "confidence_score": 0.0
957
  }
958
 
959
+ # Check if this is likely a newspaper or handwritten document by filename
960
  is_likely_newspaper = False
961
+ is_likely_handwritten = False
962
+
963
  newspaper_keywords = ["newspaper", "gazette", "herald", "times", "journal",
964
  "chronicle", "post", "tribune", "news", "press", "gender"]
965
+
966
+ handwritten_keywords = ["handwritten", "manuscript", "letter", "correspondence", "journal", "diary"]
967
 
968
+ # Check filename for document type indicators
969
  filename_lower = file_path.name.lower()
970
+
971
+ # First check for handwritten documents
972
+ for keyword in handwritten_keywords:
973
  if keyword in filename_lower:
974
+ is_likely_handwritten = True
975
+ logger.info(f"Likely handwritten document detected from filename: {file_path.name}")
976
+ # Add handwritten-specific processing hint to custom_prompt if not already present
977
  if custom_prompt:
978
+ if "handwritten" not in custom_prompt.lower():
979
+ custom_prompt = custom_prompt + " This appears to be a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations."
980
  else:
981
+ custom_prompt = "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations."
982
  break
983
+
984
+ # Then check for newspaper if not handwritten
985
+ if not is_likely_handwritten:
986
+ for keyword in newspaper_keywords:
987
+ if keyword in filename_lower:
988
+ is_likely_newspaper = True
989
+ logger.info(f"Likely newspaper document detected from filename: {file_path.name}")
990
+ # Add newspaper-specific processing hint to custom_prompt if not already present
991
+ if custom_prompt:
992
+ if "column" not in custom_prompt.lower() and "newspaper" not in custom_prompt.lower():
993
+ custom_prompt = custom_prompt + " This appears to be a newspaper or document with columns. Please extract all text content from each column."
994
+ else:
995
+ custom_prompt = "This appears to be a newspaper or document with columns. Please extract all text content from each column, maintaining proper reading order."
996
+ break
997
 
998
  try:
999
  # Check file size
 
1066
  logger.info(f"Resized image to {new_size_mb:.2f} MB")
1067
  except ImportError:
1068
  logger.warning("PIL not available for resizing. Using original image.")
1069
+ # Use enhanced encoder with proper MIME type detection
1070
+ from ocr_utils import encode_image_for_api
1071
+ base64_data_url = encode_image_for_api(file_path)
1072
  except Exception as e:
1073
  logger.warning(f"Image resize failed: {str(e)}. Using original image.")
1074
+ # Use enhanced encoder with proper MIME type detection
1075
+ from ocr_utils import encode_image_for_api
1076
+ base64_data_url = encode_image_for_api(file_path)
1077
  else:
1078
+ # For smaller images, use as-is with proper MIME type
1079
+ from ocr_utils import encode_image_for_api
1080
+ base64_data_url = encode_image_for_api(file_path)
1081
  except Exception as e:
1082
  # Fallback to original image if any preprocessing fails
1083
  logger.warning(f"Image preprocessing failed: {str(e)}. Using original image.")
1084
+ # Use enhanced encoder with proper MIME type detection
1085
+ from ocr_utils import encode_image_for_api
1086
+ base64_data_url = encode_image_for_api(file_path)
1087
 
1088
  # Process the image with OCR
1089
  logger.info(f"Processing image with OCR using {OCR_MODEL}")
 
1175
  # Get the OCR markdown from the first page
1176
  image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""
1177
 
1178
+ # Check if the OCR response has images
1179
+ has_images = hasattr(image_response, 'pages') and image_response.pages and hasattr(image_response.pages[0], 'images') and image_response.pages[0].images
1180
+
1181
+ # Check for language information directly from the OCR model
1182
+ detected_languages = set()
1183
+
1184
+ # Check if the response has a 'languages' attribute in any form
1185
+ # First check direct attributes on the response object
1186
+ if hasattr(image_response, 'languages') and image_response.languages:
1187
+ for lang in image_response.languages:
1188
+ detected_languages.add(str(lang))
1189
+ logger.info(f"Found language in OCR response: {lang}")
1190
+
1191
+ # Then check if it's in the response as a dictionary format
1192
+ elif hasattr(image_response, '__dict__'):
1193
+ response_dict = image_response.__dict__
1194
+ if 'languages' in response_dict and response_dict['languages']:
1195
+ for lang in response_dict['languages']:
1196
+ detected_languages.add(str(lang))
1197
+ logger.info(f"Found language in OCR response dict: {lang}")
1198
+
1199
+ # Check for languages in individual pages
1200
+ if hasattr(image_response, 'pages') and image_response.pages:
1201
+ for page in image_response.pages:
1202
+ if hasattr(page, 'languages') and page.languages:
1203
+ for lang in page.languages:
1204
+ detected_languages.add(str(lang))
1205
+ logger.info(f"Found language in page: {lang}")
1206
+
1207
  # Optimize: Skip vision model step if ocr_markdown is very small or empty
1208
  # BUT make an exception for newspapers or if custom_prompt is provided
1209
+ # OR if the image has visual content worth preserving
1210
+ if (not is_likely_newspaper and not custom_prompt and not has_images) and (not image_ocr_markdown or len(image_ocr_markdown) < 50):
1211
+ logger.warning("OCR produced minimal text with no images. Returning basic result.")
1212
  return {
1213
  "file_name": file_path.name,
1214
  "topics": ["Document"],
 
1216
  "ocr_contents": {
1217
  "raw_text": image_ocr_markdown if image_ocr_markdown else "No text could be extracted from the image."
1218
  },
1219
+ "processing_note": "OCR produced minimal text content",
1220
+ # Include raw response data for images
1221
+ "raw_response_data": serialize_ocr_response(image_response)
1222
  }
1223
 
1224
  # For newspapers with little text in OCR, set a more explicit prompt
 
1228
  custom_prompt = "This is a newspaper or document with columns. The OCR may not have captured all text. Please examine the image carefully and extract ALL text content visible in the document, reading each column from top to bottom."
1229
  elif "extract all text" not in custom_prompt.lower():
1230
  custom_prompt += " Please examine the image carefully and extract ALL text content visible in the document."
1231
+
1232
+ # For images with minimal text but visual content, enhance the prompt
1233
+ elif has_images and (not image_ocr_markdown or len(image_ocr_markdown) < 100):
1234
+ logger.info("Document with images but minimal text detected. Using enhanced prompt for mixed media.")
1235
+ if not custom_prompt:
1236
+ custom_prompt = "This is a mixed media document with both text and important visual elements. Please carefully describe the image content and extract all visible text, preserving the relationship between text and visuals."
1237
+ elif "visual" not in custom_prompt.lower() and "image" not in custom_prompt.lower():
1238
+ custom_prompt += " The document contains important visual elements that should be described along with the text content."
1239
 
1240
  # Extract structured data using the appropriate model, with a single API call
1241
  if use_vision:
 
1245
  logger.info(f"Using text-only model: {TEXT_MODEL}")
1246
  result = self._extract_structured_data_text_only(image_ocr_markdown, file_path.name, custom_prompt)
1247
 
1248
+ # If we have detected languages directly from the OCR model, use them
1249
+ if detected_languages:
1250
+ logger.info(f"Using languages detected by OCR model: {', '.join(detected_languages)}")
1251
+ result['languages'] = list(detected_languages)
1252
+ # Add flag to indicate source of language detection
1253
+ result['language_detection_source'] = 'mistral-ocr-latest'
1254
+
1255
  # Store the serialized OCR response for image rendering (for compatibility with original version)
1256
  # Don't store raw_response directly as it's not JSON serializable
1257
  serialized_response = serialize_ocr_response(image_response)
 
1259
 
1260
  # Store key parts of the OCR response for image rendering
1261
  # With serialized format that can be stored in JSON
 
1262
  result['has_images'] = has_images
1263
 
1264
  if has_images:
 
1371
  logger.info("Test mode or no API key, using text-only processing")
1372
  return self._extract_structured_data_text_only(ocr_markdown, filename)
1373
 
 
 
1374
  # Use only the first part of OCR text to keep prompts small and processing fast
1375
  if len(ocr_markdown) > 1000:
1376
  truncated_ocr = ocr_markdown[:1000]
 
1378
  else:
1379
  truncated_ocr = ocr_markdown
1380
 
1381
+ # Build a comprehensive prompt with OCR text and detailed instructions for language detection and image handling
1382
+ enhanced_prompt = f"This is a document's OCR text:\n<BEGIN_OCR>\n{truncated_ocr}\n<END_OCR>\n\n"
1383
+
1384
+ # Add custom prompt if provided
1385
+ if custom_prompt:
1386
+ enhanced_prompt += f"User instructions: {custom_prompt}\n\n"
1387
+
1388
+ # Add comprehensive extraction instructions with language detection guidance
1389
+ enhanced_prompt += "Extract all text content accurately from this document, including any text visible in the image that may not have been captured by OCR.\n\n"
1390
+ enhanced_prompt += "IMPORTANT: Precisely identify and list ALL languages present in the document separately. Look closely for multiple languages that might appear together.\n"
1391
+ enhanced_prompt += "For language detection, examine these specific indicators:\n"
1392
+ enhanced_prompt += "- Portuguese: accents (ã, õ, á, é, ê, ó, ç), words like 'e', 'o', 'de', 'da', 'do', 'em', 'para', 'que', 'não', 'com'\n"
1393
+ enhanced_prompt += "- Spanish: ñ, inverted punctuation (¿, ¡), accents (á, é, í, ó, ú), words like 'el', 'la', 'los', 'las', 'y', 'en', 'por', 'que', 'con'\n"
1394
+ enhanced_prompt += "- French: accents (é, è, ê, à, ç), words like 'le', 'la', 'les', 'et', 'en', 'de', 'du'\n"
1395
+ enhanced_prompt += "- German: umlauts (ä, ö, ü), sharp s (ß), words like 'und', 'der', 'die', 'das', 'in', 'mit'\n"
1396
+ enhanced_prompt += "- Italian: accents (à, è, é, ì, ò, ù), words like 'il', 'la', 'e', 'di', 'che', 'per', 'con'\n"
1397
+ enhanced_prompt += "- Chinese: hanzi characters (汉字), lack of spaces between words, markers like 的, 是, 了, 在, 和, 有\n"
1398
+ enhanced_prompt += "- Latin: words like 'et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed'\n\n"
1399
+ enhanced_prompt += "If the document contains multiple columns or sections, process each section independently and then combine them logically.\n"
1400
+ enhanced_prompt += "Return ALL detected languages as separate entries in the languages array, never combine them."
1401
 
1402
  # Measure API call time for optimization feedback
1403
  start_time = time.time()
 
1406
  # Use a fixed, shorter timeout for single-page documents
1407
  timeout_ms = 45000 # 45 seconds is optimal for most single-page documents
1408
 
1409
+ logger.info(f"Calling vision model with {timeout_ms}ms timeout")
1410
  chat_response = self.client.chat.parse(
1411
  model=VISION_MODEL,
1412
  messages=[
 
1424
  )
1425
 
1426
  api_time = time.time() - start_time
1427
+ logger.info(f"Vision model completed in {api_time:.2f}s")
1428
 
1429
  except Exception as e:
1430
  # If there's an error with the enhanced prompt, try progressively simpler approaches
 
1504
  if 'languages' in result:
1505
  result['languages'] = [str(lang) for lang in result.get('languages', [])]
1506
 
1507
+ # Add simplified metadata about processing
1508
  result['processing_info'] = {
1509
  'method': 'vision_model',
 
1510
  'ocr_text_length': len(ocr_markdown),
1511
  'api_response_time': time.time() - start_time
1512
  }
1513
 
1514
+ # Note if custom prompt was applied
1515
  if custom_prompt:
1516
  result['custom_prompt_applied'] = 'vision_model'
 
 
 
1517
 
1518
  # Add confidence score if not present
1519
  if 'confidence_score' not in result:
 
1526
 
1527
  return result
1528
 
1529
+ # We've removed document type detection entirely for simplicity
 
 
1530
 
1531
+ # Create a prompt with enhanced language detection instructions
 
 
1532
  generic_section = (
1533
+ f"You are an OCR specialist processing historical documents. "
1534
+ f"Focus on accurately extracting text content while preserving structure and formatting. "
1535
+ f"Pay attention to any historical features and document characteristics.\n\n"
1536
+ f"IMPORTANT: Accurately identify the document's language(s). Look for language-specific characters, words, and phrases. "
1537
+ f"Specifically check for French (accents like é, è, ç, words like 'le', 'la', 'et', 'est'), German (umlauts, words like 'und', 'der', 'das'), "
1538
+ f"Latin, and other non-English languages. Carefully analyze the text before determining language.\n\n"
1539
+ f"Create a structured JSON response with the following fields:\n"
1540
+ f"- file_name: The document's name\n"
1541
+ f"- topics: An array of topics covered in the document\n"
1542
+ f"- languages: An array of languages used in the document (be precise and specific about language detection)\n"
1543
+ f"- ocr_contents: A comprehensive dictionary with the document's contents including:\n"
1544
+ f" * title: The main title or heading (if present)\n"
1545
+ f" * content: The main body content\n"
1546
+ f" * raw_text: The complete OCR text\n"
1547
  )
1548
 
 
 
1549
  # Add custom prompt if provided
1550
  custom_section = ""
1551
  if custom_prompt:
1552
+ custom_section = f"\n\nUser-provided instructions: {custom_prompt}\n"
 
 
1553
 
1554
+ # Return the enhanced prompt
1555
+ return generic_section + custom_section
1556
 
1557
  def _extract_structured_data_text_only(self, ocr_markdown, filename, custom_prompt=None):
1558
  """
1559
  Extract structured data using text-only model with detailed historical context prompting
1560
+ and improved error handling with enhanced language detection
1561
  """
1562
  logger = logging.getLogger("text_processor")
1563
  start_time = time.time()
 
1566
  # Fast path: Skip for minimal OCR text
1567
  if not ocr_markdown or len(ocr_markdown.strip()) < 50:
1568
  logger.info("Minimal OCR text - returning basic result")
1569
+
1570
+ # Attempt comprehensive language detection even for minimal text
1571
+ detected_languages = []
1572
+
1573
+ # Simple language detection based on character frequency
1574
+ if ocr_markdown and len(ocr_markdown.strip()) > 10:
1575
+ # Define indicators for all supported languages
1576
+ language_indicators = {
1577
+ "Portuguese": {
1578
+ "chars": ['ã', 'õ', 'á', 'é', 'ê', 'í', 'ó', 'ú', 'ç'],
1579
+ "words": ['e', 'o', 'de', 'da', 'do', 'em', 'para', 'que', 'não', 'com']
1580
+ },
1581
+ "Spanish": {
1582
+ "chars": ['ñ', 'á', 'é', 'í', 'ó', 'ú', '¿', '¡'],
1583
+ "words": ['el', 'la', 'los', 'las', 'y', 'en', 'por', 'que', 'con', 'del']
1584
+ },
1585
+ "French": {
1586
+ "chars": ['é', 'è', 'ê', 'à', 'ç', 'ù', 'â', 'î', 'ô', 'û'],
1587
+ "words": ['le', 'la', 'les', 'et', 'en', 'de', 'du', 'des', 'un', 'une']
1588
+ },
1589
+ "German": {
1590
+ "chars": ['ä', 'ö', 'ü', 'ß'],
1591
+ "words": ['der', 'die', 'das', 'und', 'ist', 'von', 'mit', 'für', 'sich']
1592
+ },
1593
+ "Italian": {
1594
+ "chars": ['à', 'è', 'é', 'ì', 'ò', 'ù'],
1595
+ "words": ['il', 'la', 'e', 'di', 'che', 'per', 'con', 'sono', 'non']
1596
+ },
1597
+ "Latin": {
1598
+ "chars": [],
1599
+ "words": ['et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed', 'qui', 'quod']
1600
+ }
1601
+ }
1602
+
1603
+ words = ocr_markdown.lower().split()
1604
+
1605
+ # Check for indicators of each language
1606
+ for language, indicators in language_indicators.items():
1607
+ chars = indicators["chars"]
1608
+ lang_words = indicators["words"]
1609
+
1610
+ has_chars = any(char in ocr_markdown for char in chars) if chars else False
1611
+ word_count = sum(1 for word in words if word in lang_words)
1612
+
1613
+ # Add language if strong enough indicators are present
1614
+ if has_chars or word_count >= 2:
1615
+ detected_languages.append(language)
1616
+
1617
+ # Check for English separately
1618
+ english_words = ['the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for', 'it']
1619
+ english_count = sum(1 for word in words if word in english_words)
1620
+ if english_count >= 2:
1621
+ detected_languages.append("English")
1622
+
1623
+ # If no languages detected, default to English
1624
+ if not detected_languages:
1625
+ detected_languages = ["English"]
1626
+
1627
  return {
1628
  "file_name": filename,
1629
  "topics": ["Document"],
1630
+ "languages": detected_languages,
1631
  "ocr_contents": {
1632
  "raw_text": ocr_markdown if ocr_markdown else "No text could be extracted"
1633
  },
 
1648
  "processing_method": "test_mode"
1649
  }
1650
 
 
 
1651
  # If OCR text is very large, truncate it to avoid API limits
1652
  truncated_text = ocr_markdown
1653
  if len(ocr_markdown) > 25000:
 
1655
  truncated_text = ocr_markdown[:15000] + "\n...[content truncated]...\n" + ocr_markdown[-5000:]
1656
  logger.info(f"OCR text truncated from {len(ocr_markdown)} to {len(truncated_text)} chars")
1657
 
1658
+ # Build a prompt with enhanced language detection instructions
1659
+ enhanced_prompt = f"This is a document's OCR text:\n<BEGIN_OCR>\n{truncated_text}\n<END_OCR>\n\n"
1660
+
1661
+ # Add custom prompt if provided
1662
+ if custom_prompt:
1663
+ enhanced_prompt += f"User instructions: {custom_prompt}\n\n"
1664
+
1665
+ # Add thorough extraction instructions with enhanced language detection and metadata requirements
1666
+ enhanced_prompt += "Extract all text content accurately from this document. Return structured data with the document's contents.\n\n"
1667
+ enhanced_prompt += "IMPORTANT: Precisely identify and list ALL languages present in the document separately. Look closely for multiple languages that might appear together.\n"
1668
+ enhanced_prompt += "For language detection, examine these specific indicators:\n"
1669
+ enhanced_prompt += "- French: accents (é, è, ê, à, ç), words like 'le', 'la', 'les', 'et', 'en', 'de', 'du'\n"
1670
+ enhanced_prompt += "- German: umlauts (ä, ö, ü), sharp s (ß), words like 'und', 'der', 'die', 'das', 'in', 'mit'\n"
1671
+ enhanced_prompt += "- Spanish: ñ, inverted punctuation (¿, ¡), accents (á, é, í, ó, ú), words like 'el', 'la', 'los', 'las', 'y', 'en'\n"
1672
+ enhanced_prompt += "- Italian: words like 'il', 'la', 'e', 'di', 'che', 'per', 'con'\n"
1673
+ enhanced_prompt += "- Chinese: hanzi characters (汉字), lack of spaces between words, markers like 的, 是, 了, 在, 和, 有\n"
1674
+ enhanced_prompt += "- Latin: words like 'et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed'\n"
1675
+ enhanced_prompt += "Do NOT classify text as English unless you can positively confirm it contains specifically English words and phrases.\n\n"
1676
+ enhanced_prompt += "Return ALL detected languages as separate entries in the languages array. If multiple languages are present, list them ALL separately."
1677
 
1678
  # Use enhanced prompt with text-only model - with retry logic
1679
  max_retries = 2
 
1711
  if 'languages' in result:
1712
  result['languages'] = [str(lang) for lang in result.get('languages', [])]
1713
 
1714
+ # Add simplified processing metadata
1715
  result['processing_method'] = 'text_model'
 
1716
  result['model_used'] = TEXT_MODEL
1717
  result['processing_time'] = time.time() - start_time
1718
 
1719
  # Flag when custom prompt has been successfully applied
1720
  if custom_prompt:
1721
  result['custom_prompt_applied'] = 'text_model'
 
 
 
1722
 
1723
  # Add raw text for reference if not already present
1724
  if 'ocr_contents' in result and 'raw_text' not in result['ocr_contents']:
 
1781
  "processing_time": time.time() - start_time
1782
  }
1783
 
1784
+ # No topic detection to avoid issue with document misclassification
 
 
1785
 
1786
  except Exception as inner_e:
1787
  logger.error(f"Error creating basic result: {str(inner_e)}")
 
1809
  processor = StructuredOCR()
1810
  result = processor.process_file(file_path)
1811
 
1812
+ print(json.dumps(result, indent=2))
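
Note (illustrative, not part of this commit): the structured_ocr.py changes above consolidate per-page language detections returned by mistral-ocr-latest and fall back to a stop-word heuristic when the model reports nothing. A minimal sketch of that consolidation logic, assuming each page result is a dict that may carry a 'languages' list plus an 'ocr_contents.raw_text' string; the helper name consolidate_languages is hypothetical:

def consolidate_languages(page_results):
    """Merge language detections across pages, mirroring the PDF path above."""
    detected = set()
    for page in page_results:
        # Each page result may expose a 'languages' list from mistral-ocr-latest
        for lang in page.get('languages', []):
            detected.add(str(lang))

    if detected:
        # Languages came straight from the OCR model
        return {'languages': sorted(detected),
                'language_detection_source': 'mistral-ocr-latest'}

    # Fallback: crude stop-word counting over the combined raw text
    text = " ".join(p.get('ocr_contents', {}).get('raw_text', '')
                    for p in page_results).lower()
    words = set(text.split())
    stopwords = {
        'English': {'the', 'and', 'of', 'to', 'in'},
        'French':  {'le', 'la', 'les', 'et', 'des'},
        'German':  {'und', 'der', 'die', 'das', 'mit'},
    }
    fallback = [lang for lang, markers in stopwords.items() if len(words & markers) >= 2]
    return {'languages': fallback or ['English'],
            'language_detection_source': 'heuristic'}

In the diff itself the merged set only overwrites result['languages'] when the OCR response actually exposed language data, which preserves the existing prompt-based detection as the fallback path.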
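Also illustrative rather than part of the commit: the image branch above now calls ocr_utils.encode_image_for_api instead of hard-coding a data:image/jpeg prefix. That helper lives in ocr_utils.py and is not shown in this diff; the body below is an assumption about what "proper MIME type detection" means here (only the function name comes from the diff):

import base64
import mimetypes
from pathlib import Path

def encode_image_for_api(file_path) -> str:
    """Return a base64 data URL whose MIME type is guessed from the file extension."""
    path = Path(file_path)
    mime_type, _ = mimetypes.guess_type(path.name)
    mime_type = mime_type or "image/jpeg"  # conservative fallback for unknown extensions
    encoded = base64.b64encode(path.read_bytes()).decode()
    return f"data:{mime_type};base64,{encoded}"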
ui/layout.py CHANGED
@@ -1,217 +1,339 @@
1
  import streamlit as st
2
 
3
  def load_css():
4
- """Load custom CSS for the application"""
5
  st.markdown("""
6
  <style>
7
- /* Global styles */
8
- body {
9
- font-family: 'Source Sans Pro', sans-serif;
10
- color: #333;
11
  }
12
 
13
- /* Header styles */
 
 
14
  h1, h2, h3, h4, h5, h6 {
15
- font-family: 'Georgia', serif;
16
  font-weight: 600;
17
- color: #1E3A8A;
 
 
18
  }
19
 
20
- /* Processing status container */
21
- .processing-status-container {
22
- padding: 10px 15px;
23
- border-left: 4px solid #1E88E5;
24
- background-color: #E3F2FD;
25
- border-radius: 0 4px 4px 0;
26
- margin: 10px 0;
27
- font-size: 14px;
28
  }
29
 
30
- /* Previous results styling */
31
- .previous-results-container {
32
- margin-top: 20px;
33
  }
34
 
35
- .result-card {
36
- background-color: #f8f9fa;
37
- border-radius: 8px;
38
- padding: 15px;
39
- margin-bottom: 15px;
40
- border: 1px solid #e0e0e0;
41
- transition: all 0.2s ease;
42
  }
43
 
44
- .result-card:hover {
45
- box-shadow: 0 4px 8px rgba(0,0,0,0.1);
46
- border-color: #c0c0c0;
47
  }
48
 
49
- .result-header {
50
- display: flex;
51
- justify-content: space-between;
52
- margin-bottom: 10px;
 
 
53
  }
54
 
55
- .result-filename {
56
- font-weight: bold;
57
- font-size: 16px;
 
 
58
  }
59
 
60
- .result-date {
61
- color: #666;
62
- font-size: 14px;
 
 
63
  }
64
 
65
- .result-metadata {
66
- margin-top: 10px;
67
- font-size: 14px;
 
 
68
  }
69
 
70
- .result-tag {
 
 
71
  margin-bottom: 5px;
72
- color: #555;
73
  }
74
 
75
- .result-action-button {
76
- margin-top: 10px;
77
- text-align: right;
78
  }
79
 
80
- .selected-result-container {
81
- margin-top: 30px;
82
- padding: 20px;
83
- background-color: #f0f2f6;
84
- border-radius: 8px;
85
- border: 1px solid #d0d7de;
86
  }
87
 
88
- .selected-result-title {
89
- font-size: 18px;
90
- font-weight: bold;
91
- color: #1E3A8A;
 
 
 
 
 
 
92
  }
93
 
94
- /* Raw text editor styling */
 
 
 
 
 
95
  .stTextArea textarea {
96
- font-family: 'Courier New', monospace;
97
- font-size: 14px;
98
- line-height: 1.5;
 
99
  }
100
 
101
- /* Image and text side-by-side styling */
102
- .image-text-container {
103
- display: flex;
104
- gap: 20px;
105
- margin-bottom: 20px;
 
 
 
 
 
 
106
  }
107
-
108
- .image-container {
109
- flex: 1;
 
 
 
 
110
  }
111
 
112
- .text-container {
113
- flex: 1;
 
 
 
 
 
 
 
114
  }
115
 
116
- /* Sidebar styling */
117
- .sidebar .stRadio > div {
118
- flex-direction: row;
119
  }
120
 
121
- .sidebar .stRadio label {
122
- margin-right: 10px;
 
123
  }
124
 
125
- /* Optimize spacing in sidebar */
126
  .sidebar .block-container {
127
- padding-top: 0;
128
  }
129
 
130
- .sidebar [data-testid="stVerticalBlock"] {
131
- gap: 0;
132
- }
133
-
134
- /* Button styling */
135
- .stButton > button {
136
- border-radius: 4px;
137
- font-weight: 600;
138
  }
139
 
140
- /* File uploader styling */
141
- .stFileUploader > section > div {
142
- min-height: 100px;
143
  }
144
 
145
- /* Reset vertical text in file uploader */
146
- .stFileUploader p,
147
- .stFileUploader span,
148
- .stFileUploader div p,
149
- .stFileUploader div span,
150
- .stFileUploader label p,
151
- .stFileUploader label span,
152
- .stFileUploader div[data-testid="stFileUploadDropzone"] p,
153
- .stFileUploader div[data-testid="stFileUploadDropzone"] span {
154
- writing-mode: horizontal-tb !important;
155
  }
156
 
157
- /* Metadata styling */
158
- .metadata-card {
159
- background-color: #f8f9fa;
160
- border-radius: 8px;
161
- padding: 15px;
162
- margin-bottom: 20px;
163
- border: 1px solid #e0e0e0;
164
  }
165
 
166
- /* Document content styling */
167
- .document-content {
168
- margin-top: 10px;
 
 
 
169
  }
170
 
171
- /* Tab styling */
172
- .stTabs [data-baseweb="tab-list"] {
173
- gap: 8px;
 
174
  }
175
 
176
- .stTabs [data-baseweb="tab"] {
177
- padding: 8px 16px;
178
- border-radius: 4px 4px 0 0;
 
 
179
  }
180
 
181
- /* Success message styling */
182
- .stSuccess {
183
- background-color: #D4EDDA;
184
- color: #155724;
185
- padding: 10px;
186
- border-radius: 4px;
187
- border-left: 5px solid #155724;
188
  }
189
 
190
- /* Error message styling */
191
- .stError {
192
- background-color: #F8D7DA;
193
- color: #721C24;
194
- padding: 10px;
195
- border-radius: 4px;
196
- border-left: 5px solid #721C24;
197
  }
198
 
199
- /* Info message styling */
200
- .stInfo {
201
- background-color: #D1ECF1;
202
- color: #0C5460;
203
- padding: 10px;
204
- border-radius: 4px;
205
- border-left: 5px solid #0C5460;
206
  }
207
 
208
- /* Warning message styling */
209
- .stWarning {
210
- background-color: #FFF3CD;
211
- color: #856404;
212
- padding: 10px;
213
- border-radius: 4px;
214
- border-left: 5px solid #856404;
215
  }
216
  </style>
217
  """, unsafe_allow_html=True)
 
1
  import streamlit as st
2
 
3
  def load_css():
4
+ """Load custom CSS for the application - inspired by mistral-ocr implementations"""
5
  st.markdown("""
6
  <style>
7
+ /* Global styles - clean, modern approach with consistent line height */
8
+ :root {
9
+ --standard-line-height: 1.5;
 
10
  }
11
 
12
+ body {
13
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
14
+ color: #111827;
15
+ line-height: var(--standard-line-height);
16
+ }
17
+
18
+ /* Remove all container backgrounds that cause the white box issue */
19
+ div[data-testid="stMarkdownContainer"],
20
+ div[data-testid="stText"],
21
+ div.stMarkdown,
22
+ .stText > div:first-child,
23
+ .element-container > div,
24
+ div[data-testid="column"] > div > div > div {
25
+ background-color: transparent !important;
26
+ box-shadow: none !important;
27
+ border: none !important;
28
+ border-radius: 0 !important;
29
+ padding: 0 !important;
30
+ margin: 0 !important;
31
+ }
32
+
33
+ /* Base text styling with standardized line height */
34
+ div[data-testid="stMarkdownContainer"] > p {
35
+ margin: 0 0 0.3rem 0 !important;
36
+ padding: 0 !important;
37
+ font-size: 0.95rem !important;
38
+ line-height: var(--standard-line-height) !important;
39
+ }
40
+
41
+ /* Move content to top of columns with minimal padding */
42
+ [data-testid="column"] {
43
+ align-items: flex-start !important;
44
+ padding: 0 0.5rem !important;
45
+ gap: 0.5rem !important;
46
+ }
47
+
48
+ /* Clean minimal heading styles with better line height */
49
  h1, h2, h3, h4, h5, h6 {
50
+ font-family: 'Inter', system-ui, sans-serif;
51
  font-weight: 600;
52
+ color: #111827;
53
+ margin: 0.4rem 0 0.2rem 0 !important;
54
+ padding: 0 !important;
55
+ background-color: transparent !important;
56
+ line-height: 1.3 !important; /* Slightly increased for headings but still compact */
57
+ }
58
+
59
+ /* Simple section headers with subtle styling */
60
+ .block-container [data-testid="column"] h4 {
61
+ font-size: 0.95rem !important;
62
+ font-weight: 600 !important;
63
+ color: #374151 !important;
64
+ border-bottom: 1px solid #e5e7eb;
65
+ padding-bottom: 0.15rem !important;
66
+ margin-bottom: 0.25rem !important;
67
+ }
68
+
69
+ /* Reduce whitespace between elements */
70
+ .element-container {
71
+ margin-bottom: 0.2rem !important;
72
+ }
73
+
74
+ /* OCR text container with improved contrast and styling */
75
+ .ocr-text-container {
76
+ font-family: 'Inter', system-ui, sans-serif;
77
+ font-size: 0.95rem;
78
+ line-height: var(--standard-line-height); /* Consistent line height */
79
+ color: #111827;
80
+ margin-bottom: 0.4rem;
81
+ max-height: 600px;
82
+ overflow-y: auto;
83
+ background-color: transparent;
84
+ padding: 6px 10px;
85
+ border-radius: 4px;
86
+ border: 1px solid #e2e8f0;
87
  }
88
 
89
+ /* Custom scrollbar styling */
90
+ .ocr-text-container::-webkit-scrollbar {
91
+ width: 6px;
92
+ height: 6px;
 
 
 
 
93
  }
94
 
95
+ .ocr-text-container::-webkit-scrollbar-track {
96
+ background: #f1f1f1;
97
+ border-radius: 3px;
98
  }
99
 
100
+ .ocr-text-container::-webkit-scrollbar-thumb {
101
+ background: #c1c1c1;
102
+ border-radius: 3px;
 
 
 
 
103
  }
104
 
105
+ .ocr-text-container::-webkit-scrollbar-thumb:hover {
106
+ background: #a0a0a0;
 
107
  }
108
 
109
+ /* Styling for all expanders/accordions */
110
+ .st-expander,
111
+ details.streamlit-expanderHeader {
112
+ border: 1px solid #e5e7eb !important;
113
+ border-radius: 4px !important;
114
+ box-shadow: none !important;
115
+ background-color: transparent !important;
116
+ margin-bottom: 6px !important;
117
  }
118
 
119
+ .st-expanderHeader,
120
+ summary.streamlit-expanderHeader {
121
+ font-size: 0.95rem !important;
122
+ font-weight: 600 !important;
123
+ color: #374151 !important;
124
+ padding: 0.4rem 0.6rem !important;
125
+ background-color: rgba(241, 245, 249, 0.5) !important;
126
+ border-bottom: 1px solid #e5e7eb !important;
127
+ border-radius: 3px 3px 0 0 !important;
128
  }
129
 
130
+ .st-expanderContent,
131
+ details[open] > div:nth-child(2) {
132
+ border-top: none !important;
133
+ padding: 0.4rem 0.6rem !important;
134
+ background-color: transparent !important;
135
  }
136
 
137
+ /* Set expander text to have good contrast */
138
+ .st-expanderContent p,
139
+ .st-expanderContent li,
140
+ .st-expanderContent span {
141
+ color: #1f2937 !important;
142
  }
143
 
144
+ /* Streamlined OCR image display */
145
+ .ocr-image-container {
146
+ border: 1px solid #e2e8f0;
147
+ border-radius: 4px;
148
+ padding: 0;
149
+ background-color: transparent;
150
  margin-bottom: 5px;
 
151
  }
152
 
153
+ .ocr-image-container img {
154
+ border-radius: 4px;
155
+ width: 100%;
156
  }
157
 
158
+ /* Subtle document sections */
159
+ .document-section {
160
+ margin-bottom: 0.4rem !important;
 
 
 
161
  }
162
 
163
+ /* Compact tag styling */
164
+ .subject-tag {
165
+ display: inline-block;
166
+ padding: 0.1rem 0.4rem;
167
+ border-radius: 3px;
168
+ font-size: 0.7rem;
169
+ margin: 0 0.2rem 0.2rem 0;
170
+ background-color: #f3f4f6;
171
+ color: #374151;
172
+ border: 1px solid #e5e7eb;
173
  }
174
 
175
+ .tag-time-period { color: #1e40af; background-color: #eff6ff; border-color: #bfdbfe; }
176
+ .tag-language { color: #065f46; background-color: #ecfdf5; border-color: #a7f3d0; }
177
+ .tag-document-type { color: #5b21b6; background-color: #f5f3ff; border-color: #ddd6fe; }
178
+ .tag-subject { color: #166534; background-color: #f0fdf4; border-color: #bbf7d0; }
179
+
180
+ /* Clean text area */
181
  .stTextArea textarea {
182
+ font-family: 'Roboto Mono', monospace;
183
+ font-size: 0.9rem;
184
+ line-height: var(--standard-line-height); /* Consistent line height */
185
+ padding: 0.5rem;
186
  }
187
 
188
+ /* Button styling - fixed for text overflow issues */
189
+ .stButton > button {
190
+ border-radius: 4px;
191
+ font-weight: 400;
192
+ line-height: var(--standard-line-height);
193
+ padding: 0.4rem 0.75rem !important;
194
+ margin: 0;
195
+ min-width: 150px !important; /* Increased minimum width */
196
+ white-space: normal !important; /* Allow text to wrap if needed */
197
+ overflow: visible !important; /* Ensure text doesn't get cut off */
198
+ height: auto !important; /* Allow height to adjust as needed */
199
+ text-overflow: clip !important; /* Don't clip text */
200
+ display: inline-block !important; /* Better content handling */
201
+ text-align: center !important; /* Center text */
202
+ }
203
+
204
+ /* Fix button text alignment and prevent truncation */
205
+ .stButton > button > div,
206
+ .stButton > button span,
207
+ .stButton > button p {
208
+ display: inline-block !important;
209
+ align-items: center;
210
+ white-space: normal !important;
211
+ overflow: visible !important;
212
+ width: auto !important;
213
+ text-overflow: clip !important;
214
+ word-wrap: normal !important;
215
+ }
216
+
217
+ /* Fix for all action buttons in the application */
218
+ [data-testid="stHorizontalBlock"] button,
219
+ button[key="close_document_btn"],
220
+ button[key="process_document_btn"],
221
+ button[key="load_sample_btn"],
222
+ button[key="view_btn"],
223
+ .stDownloadButton button,
224
+ button[key*="copy_btn"],
225
+ button[key*="download_btn"],
226
+ button[key*="view_"] {
227
+ width: auto !important;
228
+ min-width: 150px !important;
229
+ max-width: none !important;
230
+ display: inline-block !important;
231
+ white-space: normal !important;
232
+ overflow: visible !important;
233
+ text-align: center !important;
234
+ text-overflow: clip !important;
235
+ word-break: normal !important;
236
+ padding: 0.4rem 0.75rem !important;
237
  }
238
+
239
+ /* Ensure text doesn't wrap awkwardly for buttons */
240
+ button span p {
241
+ margin: 0 !important;
242
+ padding: 0 !important;
243
+ white-space: normal !important;
244
+ overflow: visible !important;
245
  }
246
 
247
+ /* Extra button container fixes for all button types */
248
+ .stButton, .stDownloadButton, [data-testid="stDownloadButton"] {
249
+ width: auto !important;
250
+ min-width: 150px !important;
251
+ overflow: visible !important;
252
+ display: block !important;
253
+ background-color: white;
254
+ border: 1px solid #ddd;
255
+ box-shadow: none !important;
256
  }
257
 
258
+ /* Ensure consistent spacing in widgets */
259
+ .row-widget {
260
+ padding: 0.15rem 0 !important;
261
  }
262
 
263
+ /* Fix spacing in expanders */
264
+ .stExpander > .streamlit-expanderContent > div {
265
+ padding-top: 0.15rem !important;
266
  }
267
 
268
+ /* Optimized sidebar */
269
  .sidebar .block-container {
270
+ padding-top: 0.6rem;
271
  }
272
 
273
+ .sidebar .stRadio > div {
274
+ flex-direction: row;
 
 
 
 
 
 
275
  }
276
 
277
+ .sidebar .stRadio label {
278
+ margin-right: 0.75rem;
279
+ font-size: 0.9rem;
280
  }
281
 
282
+ /* Clean alert styles */
283
+ .stSuccess, .stError, .stInfo, .stWarning {
284
+ border-radius: 4px;
285
+ padding: 0.3rem 0.6rem;
286
+ margin: 0.2rem 0;
 
 
 
 
 
287
  }
288
 
289
+ /* Fix any remaining spacing issues */
290
+ div.element-container > div > div {
291
+ margin: 0 !important;
292
+ line-height: var(--standard-line-height); /* Ensure consistent line height */
 
 
 
293
  }
294
 
295
+ /* Fix column layouts for button containers */
296
+ [data-testid="column"] > div:has(.stButton) {
297
+ display: flex;
298
+ justify-content: flex-start;
299
+ align-items: center;
300
+ min-height: 38px; /* Match standard button height */
301
  }
302
 
303
+ /* Fix for tabs being cut off at the top of the page */
304
+ /* Main container adjustments to avoid header overlap */
305
+ .main .block-container {
306
+ padding-top: 3rem !important; /* Increased top padding to make room for Streamlit header */
307
  }
308
 
309
+ [data-testid="stTabs"] {
310
+ width: 100%;
311
+ overflow-x: visible !important;
312
+ position: relative;
313
+ z-index: 1; /* Ensure tabs are on the right layer */
314
  }
315
 
316
+ [data-testid="stTabs"] > div:first-child {
317
+ padding-left: 0.5rem;
318
+ padding-right: 0.5rem;
319
+ overflow-x: visible !important;
 
 
 
320
  }
321
 
322
+ [data-testid="stTabs"] [role="tab"] {
323
+ padding: 0.5rem 1rem;
324
+ min-width: fit-content;
325
+ white-space: nowrap;
 
 
 
326
  }
327
 
328
+ [data-testid="stTabs"] [role="tablist"] {
329
+ overflow-x: visible !important;
330
+ flex-wrap: nowrap;
331
+ margin-top: 1rem; /* Add a bit more space at the top */
 
 
 
332
  }
333
 
334
+ /* Fix header overlap issues */
335
+ header[data-testid="stHeader"] {
336
+ z-index: 999 !important; /* Keep header on top */
 
 
 
 
337
  }
338
  </style>
339
  """, unsafe_allow_html=True)
ui_components.py CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
2
  import os
3
  import io
4
  import base64
 
5
  from datetime import datetime
6
  from pathlib import Path
7
  import json
@@ -64,16 +65,16 @@ class ProgressReporter:
64
  def create_sidebar_options():
65
  """Create and return sidebar options"""
66
  with st.sidebar:
67
- st.title("OCR Settings")
68
 
69
  # Create a container for the sidebar options
70
  with st.container():
71
  # Model selection
72
- st.subheader("Model Selection")
73
  use_vision = st.toggle("Use Vision Model", value=True, help="Use vision model for better understanding of document structure")
74
 
75
  # Document type selection
76
- st.subheader("Document Type")
77
  doc_type = st.selectbox("Document Type", DOCUMENT_TYPES,
78
  help="Select the type of document you're processing for better results")
79
 
@@ -100,7 +101,7 @@ def create_sidebar_options():
100
  st.markdown("**Custom Processing Instructions**")
101
  custom_prompt = st.text_area("", value=custom_prompt,
102
  help="Customize the instructions for processing this document",
103
- height=100)
104
 
105
  # Image preprocessing options in an expandable section
106
  with st.expander("Image Preprocessing"):
@@ -131,8 +132,17 @@ def create_sidebar_options():
131
  help="Rotate image if needed")
132
 
133
  # Create preprocessing options dictionary
 
 
 
 
 
 
 
 
 
134
  preprocessing_options = {
135
- "document_type": "standard", # Use standard as default, removed duplicate option
136
  "grayscale": grayscale,
137
  "denoise": denoise,
138
  "contrast": contrast,
@@ -141,23 +151,15 @@ def create_sidebar_options():
141
 
142
  # PDF-specific options in an expandable section
143
  with st.expander("PDF Options"):
144
- pdf_dpi = st.slider("PDF Resolution (DPI)",
145
- min_value=MIN_PDF_DPI,
146
- max_value=MAX_PDF_DPI,
147
- value=DEFAULT_PDF_DPI,
148
- step=25,
149
- help="Higher DPI gives better quality but slower processing")
150
-
151
  max_pages = st.number_input("Maximum Pages to Process",
152
  min_value=1,
153
  max_value=20,
154
  value=DEFAULT_MAX_PAGES,
155
  help="Limit the number of pages to process (for multi-page PDFs)")
156
 
157
- pdf_rotation = st.radio("PDF Rotation", ROTATION_OPTIONS,
158
- horizontal=True,
159
- format_func=lambda x: f"{x}°",
160
- help="Rotate PDF pages if needed")
161
 
162
  # Create options dictionary
163
  options = {
@@ -175,28 +177,23 @@ def create_sidebar_options():
175
  def create_file_uploader():
176
  """Create and return a file uploader"""
177
  # Add app description
178
- favicon_path = os.path.join(os.path.dirname(__file__), "static/favicon.png")
179
- favicon_base64 = get_base64_from_image(favicon_path)
180
- st.markdown(f'<div style="display: flex; align-items: center; gap: 10px;"><img src="data:image/png;base64,{favicon_base64}" width="36" height="36" alt="Scroll Icon"/> <div><h2 style="margin: 0; padding: 10px 0 0 0;">Historical Document OCR</h2></div></div>', unsafe_allow_html=True)
181
- st.markdown("<p style='font-size: 0.8em; color: #666; text-align: right;'>Made possible by Mistral AI</p>", unsafe_allow_html=True)
182
 
183
  # Add project framing
184
  st.markdown("""
185
- This tool is designed to assist scholars in historical research by extracting text from challenging documents.
186
- While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
187
- historical documents, particularly:
 
188
 
189
- - **Historical newspapers** with complex layouts and aged text
190
- - **Handwritten documents** from various time periods
191
- - **Photos of archival materials** that may be difficult to read
192
-
193
- Upload a document to get started, or explore the example documents.
194
  """)
195
 
196
- # Create file uploader
197
  uploaded_file = st.file_uploader(
198
- "Upload a document",
199
- type=["pdf", "png", "jpg", "jpeg"],
200
  help="Upload a PDF or image file for OCR processing"
201
  )
202
  return uploaded_file
@@ -204,136 +201,407 @@ def create_file_uploader():
204
  def display_results(result, container, custom_prompt=""):
205
  """Display OCR results in the provided container"""
206
  with container:
207
- # Display document metadata
208
- st.subheader("Document Metadata")
 
 
209
 
210
- # Create columns for metadata
211
- meta_col1, meta_col2 = st.columns(2)
 
212
 
213
- with meta_col1:
214
- # Display document type and languages
215
- if 'detected_document_type' in result:
216
- st.write(f"**Document Type:** {result['detected_document_type']}")
 
 
 
217
 
218
- if 'languages' in result:
219
- languages = [lang for lang in result['languages'] if lang is not None]
220
- if languages:
221
- st.write(f"**Languages:** {', '.join(languages)}")
222
 
223
- with meta_col2:
224
- # Display processing time
225
- if 'processing_time' in result:
226
- st.write(f"**Processing Time:** {result['processing_time']:.1f}s")
227
-
228
- # Display page information for PDFs
229
- if 'limited_pages' in result:
230
- st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
 
 
 
 
 
 
231
 
232
- # Display subject tags if available
233
  if 'topics' in result and result['topics']:
234
- st.write("**Subject Tags:**")
235
- # Create a container with flex display for the tags
236
- st.markdown('<div style="display: flex; flex-wrap: wrap; gap: 5px; margin-top: 5px;">', unsafe_allow_html=True)
237
-
238
- # Generate a badge for each tag
239
- for topic in result['topics']:
240
- # Create colored badge based on tag category
241
- badge_color = "#546e7a" # Default color
 
242
 
243
- # Assign colors by category
244
- if any(term in topic.lower() for term in ["century", "pre-", "era", "historical"]):
245
- badge_color = "#1565c0" # Blue for time periods
246
- elif any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
247
- badge_color = "#00695c" # Teal for languages
248
- elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
249
- badge_color = "#6a1b9a" # Purple for document types
250
- elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
251
- badge_color = "#2e7d32" # Green for subject domains
252
- elif any(term in topic.lower() for term in ["preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated"]):
253
- badge_color = "#e65100" # Orange for preprocessing-related tags
254
 
255
- st.markdown(
256
- f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
257
- f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">{topic}</span>',
258
- unsafe_allow_html=True
259
- )
260
-
261
- # Close the container
262
- st.markdown('</div>', unsafe_allow_html=True)
 
 
 
 
 
 
 
 
263
 
264
- # Display OCR content
265
- st.subheader("OCR Content")
266
 
267
- # Check if we have OCR content
268
- if 'ocr_contents' in result:
269
- # Create tabs for different views
270
- has_images = result.get('has_images', False)
271
- if has_images:
272
- content_tab1, content_tab2, content_tab3 = st.tabs(["Structured View", "Raw Text", "With Images"])
273
- else:
274
- content_tab1, content_tab2 = st.tabs(["Structured View", "Raw Text"])
 
 
 
275
 
 
276
  with content_tab1:
277
- # Display structured content
278
  if isinstance(result['ocr_contents'], dict):
279
- for section, content in result['ocr_contents'].items():
280
- if content and section not in ['error', 'raw_text', 'partial_text']: # Skip error and raw text sections
281
- st.markdown(f"#### {section.replace('_', ' ').title()}")
 
 
 
 
282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  if isinstance(content, str):
284
- st.write(content)
 
 
 
 
 
 
 
 
285
  elif isinstance(content, list):
 
 
286
  for item in content:
287
  if isinstance(item, str):
288
- st.write(f"- {item}")
 
 
 
289
  else:
290
- st.write(f"- {str(item)}")
 
 
 
 
 
 
 
 
291
  elif isinstance(content, dict):
 
 
292
  for k, v in content.items():
293
- st.write(f"**{k}:** {v}")
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
- with content_tab2:
296
- # Display raw text with editing capability
297
- raw_text = ""
298
- if 'raw_text' in result['ocr_contents']:
299
- raw_text = result['ocr_contents']['raw_text']
300
- elif 'content' in result['ocr_contents']:
301
- raw_text = result['ocr_contents']['content']
302
-
303
- # Allow editing of the raw text
304
- edited_text = st.text_area("Edit Raw Text", raw_text, height=400)
305
-
306
- # Add a button to copy the edited text to clipboard
307
- if st.button("Copy to Clipboard"):
308
- st.success("Text copied to clipboard! (You can paste it elsewhere)")
309
- # Note: The actual clipboard functionality is handled by the browser
310
-
311
- # Add a download button for the edited text
312
- st.download_button(
313
- label="Download Edited Text",
314
- data=edited_text,
315
- file_name=f"{result.get('file_name', 'document').split('.')[0]}_edited.txt",
316
- mime="text/plain"
317
- )
318
 
319
- if has_images and 'pages_data' in result:
320
- with content_tab3:
321
- # Use the display_document_with_images function
322
- display_document_with_images(result)
323
-
324
- # Display custom prompt if provided
325
- if custom_prompt:
326
- with st.expander("Custom Processing Instructions"):
327
- st.write(custom_prompt)
328
-
329
- # Add download buttons
330
- st.subheader("Download Results")
331
-
332
- # Create columns for download buttons
333
- download_col1, download_col2 = st.columns(2)
334
-
335
- with download_col1:
336
- # JSON download
337
  try:
338
  json_str = json.dumps(result, indent=2)
339
  st.download_button(
@@ -344,8 +612,7 @@ def display_results(result, container, custom_prompt=""):
344
  )
345
  except Exception as e:
346
  st.error(f"Error creating JSON download: {str(e)}")
347
-
348
- with download_col2:
349
  # Text download
350
  try:
351
  if 'ocr_contents' in result:
@@ -369,314 +636,319 @@ def display_results(result, container, custom_prompt=""):
369
 
370
  def display_document_with_images(result):
371
  """Display document with images"""
372
- if 'pages_data' not in result:
 
 
 
 
 
 
373
  st.info("No image data available.")
374
  return
375
 
376
  # Display each page
377
- for i, page_data in enumerate(result['pages_data']):
378
  st.markdown(f"### Page {i+1}")
379
 
380
  # Create columns for image and text
381
  img_col, text_col = st.columns([1, 1])
382
 
383
  with img_col:
384
- # Display the image
 
 
 
385
  if 'image_data' in page_data:
386
  try:
387
  # Convert base64 to image
388
  image_data = base64.b64decode(page_data['image_data'])
389
  st.image(io.BytesIO(image_data), use_container_width=True)
 
390
  except Exception as e:
391
- st.error(f"Error displaying image: {str(e)}")
392
- else:
 
 
 
 
 
 
 
 
 
 
 
 
393
  st.info("No image available for this page.")
394
 
395
  with text_col:
396
- # Display the text with editing capability
 
397
  if 'text' in page_data:
398
- edited_text = st.text_area(f"Page {i+1} Text", page_data['text'], height=300, key=f"page_text_{i}")
 
 
 
 
 
 
 
399
 
400
- # Add a button to copy the edited text to clipboard
401
- if st.button(f"Copy Page {i+1} Text", key=f"copy_btn_{i}"):
402
- st.success(f"Page {i+1} text copied to clipboard!")
403
  else:
404
  st.info("No text available for this page.")
405
 
406
  def display_previous_results():
407
- """Display previous results tab content"""
408
- st.markdown('<h2>Previous Results</h2>', unsafe_allow_html=True)
409
 
410
- # Load custom CSS for Previous Results tab
411
- try:
412
- from ui.layout import load_css
413
- load_css()
414
- except ImportError:
415
- # If ui.layout module is not available, use a simplified version
416
- st.markdown("""
417
- <style>
418
- .previous-results-container {
419
- margin-top: 20px;
420
- }
421
- .result-card {
422
- background-color: #f8f9fa;
423
- border-radius: 8px;
424
- padding: 15px;
425
- margin-bottom: 15px;
426
- border: 1px solid #e0e0e0;
427
- }
428
- .result-header {
429
- display: flex;
430
- justify-content: space-between;
431
- margin-bottom: 10px;
432
- }
433
- .result-filename {
434
- font-weight: bold;
435
- font-size: 16px;
436
- }
437
- .result-date {
438
- color: #666;
439
- font-size: 14px;
440
- }
441
- .result-metadata {
442
- margin-top: 10px;
443
- font-size: 14px;
444
- }
445
- .result-tag {
446
- margin-bottom: 5px;
447
- color: #555;
448
- }
449
- .result-action-button {
450
- margin-top: 10px;
451
- text-align: right;
452
- }
453
- .selected-result-container {
454
- margin-top: 30px;
455
- padding: 20px;
456
- background-color: #f0f2f6;
457
- border-radius: 8px;
458
- }
459
- .selected-result-title {
460
- font-size: 18px;
461
- font-weight: bold;
462
- }
463
- </style>
464
- """, unsafe_allow_html=True)
465
 
466
  # Display previous results if available
467
  if not st.session_state.previous_results:
468
  st.markdown("""
469
- <div class="previous-results-container" style="text-align: center; padding: 40px 20px; background-color: #f0f2f6; border-radius: 8px;">
470
- <div style="font-size: 48px; margin-bottom: 20px;">📄</div>
471
- <h3 style="margin-bottom: 10px; font-weight: 600;">No Previous Results</h3>
472
- <p style="font-size: 16px;">Process a document to see your results history saved here.</p>
473
  </div>
474
  """, unsafe_allow_html=True)
475
  else:
476
- # Create a container for the results list
477
- st.markdown('<div class="previous-results-container">', unsafe_allow_html=True)
478
- st.markdown(f'<h3>{len(st.session_state.previous_results)} Previous Results</h3>', unsafe_allow_html=True)
479
-
480
- # Create two columns for filters and download buttons
481
- filter_col, download_col = st.columns([2, 1])
482
-
483
- with filter_col:
484
- # Add filter options
485
- filter_options = ["All Types"]
486
- if any(result.get("file_name", "").lower().endswith(".pdf") for result in st.session_state.previous_results):
487
- filter_options.append("PDF Documents")
488
- if any(result.get("file_name", "").lower().endswith((".jpg", ".jpeg", ".png")) for result in st.session_state.previous_results):
489
- filter_options.append("Images")
490
 
491
- selected_filter = st.selectbox("Filter by Type:", filter_options)
 
 
 
 
 
 
 
 
 
 
 
 
492
 
493
- with download_col:
494
- # Add download all button for results
495
- if len(st.session_state.previous_results) > 0:
496
- try:
497
- # Create buffer in memory instead of file on disk
498
- import io
499
- from ocr_utils import create_results_zip_in_memory
500
-
501
- # Get zip data directly in memory
502
- zip_data = create_results_zip_in_memory(st.session_state.previous_results)
503
-
504
- # Create more informative ZIP filename with timestamp
505
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 
506
 
507
- # Count document types for a more descriptive filename
508
- pdf_count = sum(1 for r in st.session_state.previous_results if r.get('file_name', '').lower().endswith('.pdf'))
509
- img_count = sum(1 for r in st.session_state.previous_results if r.get('file_name', '').lower().endswith(('.jpg', '.jpeg', '.png')))
510
 
511
- # Create more descriptive filename
512
- if pdf_count > 0 and img_count > 0:
513
- zip_filename = f"historical_ocr_mixed_{pdf_count}pdf_{img_count}img_{timestamp}.zip"
514
- elif pdf_count > 0:
515
- zip_filename = f"historical_ocr_pdf_documents_{pdf_count}_{timestamp}.zip"
516
- elif img_count > 0:
517
- zip_filename = f"historical_ocr_images_{img_count}_{timestamp}.zip"
518
  else:
519
- zip_filename = f"historical_ocr_results_{timestamp}.zip"
520
 
521
- st.download_button(
522
- label="Download All Results",
523
- data=zip_data,
524
- file_name=zip_filename,
525
- mime="application/zip",
526
- help="Download all previous results as a ZIP file containing HTML and JSON files"
527
- )
528
- except Exception as e:
529
- st.error(f"Error creating download: {str(e)}")
530
- st.info("Try with fewer results or individual downloads")
531
-
532
- # Filter results based on selection
533
- filtered_results = st.session_state.previous_results
534
- if selected_filter == "PDF Documents":
535
- filtered_results = [r for r in st.session_state.previous_results if r.get("file_name", "").lower().endswith(".pdf")]
536
- elif selected_filter == "Images":
537
- filtered_results = [r for r in st.session_state.previous_results if r.get("file_name", "").lower().endswith((".jpg", ".jpeg", ".png"))]
538
-
539
- # Show a message if no results match the filter
540
- if not filtered_results:
541
- st.markdown("""
542
- <div style="text-align: center; padding: 20px; background-color: #f9f9f9; border-radius: 5px; margin: 20px 0;">
543
- <p>No results match the selected filter.</p>
544
- </div>
545
- """, unsafe_allow_html=True)
546
-
547
- # Display each result as a card
548
- for i, result in enumerate(filtered_results):
549
- # Determine file type icon
550
- file_name = result.get("file_name", f"Document {i+1}")
551
- file_type_lower = file_name.lower()
552
-
553
- if file_type_lower.endswith(".pdf"):
554
- icon = "📄"
555
- elif file_type_lower.endswith((".jpg", ".jpeg", ".png", ".gif")):
556
- icon = "🖼️"
557
- else:
558
- icon = "📝"
559
-
560
- # Create a card for each result
561
- st.markdown(f"""
562
- <div class="result-card">
563
- <div class="result-header">
564
- <div class="result-filename">{icon} {result.get('descriptive_file_name', file_name)}</div>
565
- <div class="result-date">{result.get('timestamp', 'Unknown')}</div>
566
- </div>
567
- <div class="result-metadata">
568
- <div class="result-tag">Languages: {', '.join(result.get('languages', ['Unknown']))}</div>
569
- <div class="result-tag">Topics: {', '.join(result.get('topics', ['Unknown'])[:5])} {' + ' + str(len(result.get('topics', [])) - 5) + ' more' if len(result.get('topics', [])) > 5 else ''}</div>
570
- </div>
571
- """, unsafe_allow_html=True)
572
-
573
- # Add view button inside the card with proper styling
574
- st.markdown('<div class="result-action-button">', unsafe_allow_html=True)
575
- if st.button(f"View Document", key=f"view_{i}"):
576
- # Set the selected result in the session state
577
- st.session_state.selected_previous_result = st.session_state.previous_results[i]
578
- # Force a rerun to show the selected result
579
- st.rerun()
580
- st.markdown('</div>', unsafe_allow_html=True)
581
-
582
- # Close the result card
583
- st.markdown('</div>', unsafe_allow_html=True)
584
-
585
- # Close the container
586
- st.markdown('</div>', unsafe_allow_html=True)
587
 
588
  # Display the selected result if available
589
  if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
590
  selected_result = st.session_state.selected_previous_result
591
 
592
- # Create a styled container for the selected result
593
- st.markdown(f"""
594
- <div class="selected-result-container">
595
- <div class="result-header" style="margin-bottom: 20px;">
596
- <div class="selected-result-title">Selected Document: {selected_result.get('file_name', 'Unknown')}</div>
597
- <div class="result-date">{selected_result.get('timestamp', '')}</div>
598
- </div>
599
- """, unsafe_allow_html=True)
600
-
601
- # Display metadata in a styled way
602
- meta_col1, meta_col2 = st.columns(2)
603
-
604
- with meta_col1:
605
- # Display document metadata
606
- if 'languages' in selected_result:
607
- languages = [lang for lang in selected_result['languages'] if lang is not None]
608
- if languages:
609
- st.write(f"**Languages:** {', '.join(languages)}")
 
 
610
 
611
- if 'topics' in selected_result and selected_result['topics']:
612
- # Show topics in a more organized way with badges
613
- st.markdown("**Subject Tags:**")
614
- # Create a container with flex display for the tags
615
- st.markdown('<div style="display: flex; flex-wrap: wrap; gap: 5px; margin-top: 5px;">', unsafe_allow_html=True)
616
-
617
- # Generate a badge for each tag
618
- for topic in selected_result['topics']:
619
- # Create colored badge based on tag category
620
- badge_color = "#546e7a" # Default color
621
-
622
- # Assign colors by category
623
- if any(term in topic.lower() for term in ["century", "pre-", "era", "historical"]):
624
- badge_color = "#1565c0" # Blue for time periods
625
- elif any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
626
- badge_color = "#00695c" # Teal for languages
627
- elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
628
- badge_color = "#6a1b9a" # Purple for document types
629
- elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
630
- badge_color = "#2e7d32" # Green for subject domains
631
- elif any(term in topic.lower() for term in ["preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated"]):
632
- badge_color = "#e65100" # Orange for preprocessing-related tags
633
-
634
- st.markdown(
635
- f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
636
- f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">{topic}</span>',
637
- unsafe_allow_html=True
638
- )
639
-
640
- # Close the container
641
- st.markdown('</div>', unsafe_allow_html=True)
642
 
643
- with meta_col2:
644
- # Display processing metadata
645
- if 'limited_pages' in selected_result:
646
- st.info(f"Processed {selected_result['limited_pages']['processed']} of {selected_result['limited_pages']['total']} pages")
647
 
648
- if 'processing_time' in selected_result:
649
- proc_time = selected_result['processing_time']
650
- st.write(f"**Processing Time:** {proc_time:.1f}s")
651
 
652
- # Create tabs for content display
653
  has_images = selected_result.get('has_images', False)
654
  if has_images:
655
- view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw Text", "With Images"])
 
656
  else:
657
- view_tab1, view_tab2 = st.tabs(["Structured View", "Raw Text"])
 
658
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
659
  with view_tab1:
660
- # Display structured content
661
  if 'ocr_contents' in selected_result and isinstance(selected_result['ocr_contents'], dict):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
662
  for section, content in selected_result['ocr_contents'].items():
663
- if content and section not in ['error', 'raw_text', 'partial_text']: # Skip error and raw text sections
664
- st.markdown(f"#### {section.replace('_', ' ').title()}")
 
 
665
 
666
  if isinstance(content, str):
667
- st.write(content)
668
  elif isinstance(content, list):
669
  for item in content:
670
- if isinstance(item, str):
671
- st.write(f"- {item}")
672
- else:
673
- st.write(f"- {str(item)}")
674
  elif isinstance(content, dict):
675
  for k, v in content.items():
676
- st.write(f"**{k}:** {v}")
677
 
 
678
  with view_tab2:
679
- # Display raw text with editing capability
680
  raw_text = ""
681
  if 'ocr_contents' in selected_result:
682
  if 'raw_text' in selected_result['ocr_contents']:
@@ -684,74 +956,91 @@ def display_previous_results():
684
  elif 'content' in selected_result['ocr_contents']:
685
  raw_text = selected_result['ocr_contents']['content']
686
 
687
- # Allow editing of the raw text
688
- edited_text = st.text_area("Edit Raw Text", raw_text, height=400, key="selected_raw_text")
689
-
690
- # Add a button to copy the edited text to clipboard
691
- if st.button("Copy to Clipboard", key="selected_copy_btn"):
692
- st.success("Text copied to clipboard! (You can paste it elsewhere)")
693
 
694
- # Add a download button for the edited text
695
- st.download_button(
696
- label="Download Edited Text",
697
- data=edited_text,
698
- file_name=f"{selected_result.get('file_name', 'document').split('.')[0]}_edited.txt",
699
- mime="text/plain",
700
- key="selected_download_btn"
701
- )
 
 
 
 
702
 
 
703
  if has_images and 'pages_data' in selected_result:
704
  with view_tab3:
705
- # Use the display_document_with_images function
706
- display_document_with_images(selected_result)
707
-
708
- # Close the container
709
- st.markdown('</div>', unsafe_allow_html=True)
710
-
711
- # Add a button to close the selected result
712
- if st.button("Close Selected Document", key="close_selected"):
713
- # Clear the selected result from session state
714
- del st.session_state.selected_previous_result
715
- # Force a rerun to update the view
716
- st.rerun()
 
 
 
 
 
 
717
 
718
  def display_about_tab():
719
  """Display about tab content"""
720
- st.markdown('<h2>About Historical OCR</h2>', unsafe_allow_html=True)
721
 
722
  # Add app description
723
  st.markdown("""
724
  **Historical OCR** is a specialized tool for extracting text from historical documents, manuscripts, and printed materials.
 
725
 
726
- ### Purpose
727
-
 
728
  This tool is designed to assist scholars in historical research by extracting text from challenging documents.
729
  While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
730
  historical documents, particularly:
 
731
 
 
732
  - **Historical newspapers** with complex layouts and aged text
733
  - **Handwritten documents** from various time periods
734
  - **Photos of archival materials** that may be difficult to read
 
735
 
736
- ### Features
737
-
 
738
  - **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
739
  - **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
740
  - **Editable Results**: Review and edit extracted text directly in the interface
741
  - **Structured Content Analysis**: Automatic organization of document content
742
  - **Multi-language Support**: Process documents in various languages
743
  - **PDF Processing**: Handle multi-page historical documents
 
744
 
745
- ### How to Use
746
-
 
747
  1. Upload a document (PDF or image)
748
  2. Select the document type and adjust preprocessing options if needed
749
  3. Add custom processing instructions for specialized documents
750
  4. Process the document
751
  5. Review, edit, and download the results
 
752
 
753
- ### Technologies
754
-
 
755
  - OCR processing using Mistral AI's advanced document understanding capabilities
756
  - Image preprocessing with OpenCV
757
  - PDF handling with pdf2image
 
2
  import os
3
  import io
4
  import base64
5
+ import logging
6
  from datetime import datetime
7
  from pathlib import Path
8
  import json
 
65
  def create_sidebar_options():
66
  """Create and return sidebar options"""
67
  with st.sidebar:
68
+ st.markdown("## OCR Settings")
69
 
70
  # Create a container for the sidebar options
71
  with st.container():
72
  # Model selection
73
+ st.markdown("### Model Selection")
74
  use_vision = st.toggle("Use Vision Model", value=True, help="Use vision model for better understanding of document structure")
75
 
76
  # Document type selection
77
+ st.markdown("### Document Type")
78
  doc_type = st.selectbox("Document Type", DOCUMENT_TYPES,
79
  help="Select the type of document you're processing for better results")
80
 
 
101
  st.markdown("**Custom Processing Instructions**")
102
  custom_prompt = st.text_area("", value=custom_prompt,
103
  help="Customize the instructions for processing this document",
104
+ height=80)
105
 
106
  # Image preprocessing options in an expandable section
107
  with st.expander("Image Preprocessing"):
 
132
  help="Rotate image if needed")
133
 
134
  # Create preprocessing options dictionary
135
+ # Set document_type based on selection in UI
136
+ doc_type_for_preprocessing = "standard"
137
+ if "Handwritten" in doc_type:
138
+ doc_type_for_preprocessing = "handwritten"
139
+ elif "Newspaper" in doc_type or "Magazine" in doc_type:
140
+ doc_type_for_preprocessing = "newspaper"
141
+ elif "Book" in doc_type or "Publication" in doc_type:
142
+ doc_type_for_preprocessing = "printed"
143
+
144
  preprocessing_options = {
145
+ "document_type": doc_type_for_preprocessing,
146
  "grayscale": grayscale,
147
  "denoise": denoise,
148
  "contrast": contrast,
 
151
 
152
  # PDF-specific options in an expandable section
153
  with st.expander("PDF Options"):
 
 
 
 
 
 
 
154
  max_pages = st.number_input("Maximum Pages to Process",
155
  min_value=1,
156
  max_value=20,
157
  value=DEFAULT_MAX_PAGES,
158
  help="Limit the number of pages to process (for multi-page PDFs)")
159
 
160
+ # Set default values for removed options
161
+ pdf_dpi = DEFAULT_PDF_DPI
162
+ pdf_rotation = 0
 
163
 
164
  # Create options dictionary
165
  options = {
 
177
  def create_file_uploader():
178
  """Create and return a file uploader"""
179
  # Add app description
180
+ st.markdown(f'<div style="display: flex; align-items: center; gap: 10px;"><div style="font-size: 32px;">📜</div><div><h2 style="margin: 0; padding: 10px 0 0 0;">Historical OCR</h2></div></div>', unsafe_allow_html=True)
181
+ st.markdown("<p style='font-size: 0.8em; color: #666; text-align: left;'>Made possible by Mistral AI</p>", unsafe_allow_html=True)
 
 
182
 
183
  # Add project framing
184
  st.markdown("""
185
+ This tool assists scholars in historical research by extracting text from challenging documents. While it may not achieve 100% accuracy, it helps navigate:
186
+ - **Historical newspapers** with complex layouts
187
+ - **Handwritten documents** from various periods
188
+ - **Photos of archival materials**
189
 
190
+ Upload a document to begin, or explore the examples.
 
 
 
 
191
  """)
192
 
193
+ # Create file uploader with a more concise label
194
  uploaded_file = st.file_uploader(
195
+ "Select file",
196
+ type=["pdf", "png", "jpg"],
197
  help="Upload a PDF or image file for OCR processing"
198
  )
199
  return uploaded_file
 
201
  def display_results(result, container, custom_prompt=""):
202
  """Display OCR results in the provided container"""
203
  with container:
204
+ # No heading for document metadata - start directly with content
205
+
206
+ # Create a compact metadata section
207
+ meta_html = '<div style="display: flex; flex-wrap: wrap; gap: 0.3rem; margin-bottom: 0.3rem;">'
208
 
209
+ # Document type
210
+ if 'detected_document_type' in result:
211
+ meta_html += f'<div><strong>Type:</strong> {result["detected_document_type"]}</div>'
212
 
213
+ # Processing time
214
+ if 'processing_time' in result:
215
+ meta_html += f'<div><strong>Time:</strong> {result["processing_time"]:.1f}s</div>'
216
+
217
+ # Page information
218
+ if 'limited_pages' in result:
219
+ meta_html += f'<div><strong>Pages:</strong> {result["limited_pages"]["processed"]}/{result["limited_pages"]["total"]}</div>'
220
 
221
+ meta_html += '</div>'
222
+ st.markdown(meta_html, unsafe_allow_html=True)
223
+
224
+ # Language metadata on a separate line, Subject Tags below
225
 
226
+ # First show languages if available
227
+ if 'languages' in result and result['languages']:
228
+ languages = [lang for lang in result['languages'] if lang is not None]
229
+ if languages:
230
+ # Create a dedicated line for Languages
231
+ lang_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
232
+ lang_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Language:</div>'
233
+
234
+ # Add language tags
235
+ for lang in languages:
236
+ # Clean language name if needed
237
+ clean_lang = str(lang).strip()
238
+ if clean_lang: # Only add if not empty
239
+ lang_html += f'<span class="subject-tag tag-language">{clean_lang}</span>'
240
+
241
+ lang_html += '</div>'
242
+ st.markdown(lang_html, unsafe_allow_html=True)
243
+
244
+ # Create a separate line for Time if we have time-related tags
245
+ if 'topics' in result and result['topics']:
246
+ time_tags = [topic for topic in result['topics']
247
+ if any(term in topic.lower() for term in ["century", "pre-", "era", "historical"])]
248
+ if time_tags:
249
+ time_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
250
+ time_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Time:</div>'
251
+ for tag in time_tags:
252
+ time_html += f'<span class="subject-tag tag-time-period">{tag}</span>'
253
+ time_html += '</div>'
254
+ st.markdown(time_html, unsafe_allow_html=True)
255
 
256
+ # Then display remaining subject tags if available
257
  if 'topics' in result and result['topics']:
258
+ # Filter out time-related tags which are already displayed
259
+ subject_tags = [topic for topic in result['topics']
260
+ if not any(term in topic.lower() for term in ["century", "pre-", "era", "historical"])]
261
+
262
+ if subject_tags:
263
+ # Create a separate line for Subject Tags
264
+ tags_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
265
+ tags_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Subject Tags:</div>'
266
+ tags_html += '<div style="display: flex; flex-wrap: wrap; gap: 2px; align-items: center;">'
267
 
268
+ # Generate a badge for each remaining tag
269
+ for topic in subject_tags:
270
+ # Determine tag category class
271
+ tag_class = "subject-tag" # Default class
 
 
 
 
 
 
 
272
 
273
+ # Add specialized class based on category
274
+ if any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
275
+ tag_class += " tag-language" # Languages
276
+ elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
277
+ tag_class += " tag-document-type" # Document types
278
+ elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
279
+ tag_class += " tag-subject" # Subject domains
280
+
281
+ # Add each tag as an inline span
282
+ tags_html += f'<span class="{tag_class}">{topic}</span>'
283
+
284
+ # Close the containers
285
+ tags_html += '</div></div>'
286
+
287
+ # Render the subject tags section
288
+ st.markdown(tags_html, unsafe_allow_html=True)
289
 
290
+ # No OCR content heading - start directly with tabs
 
291
 
292
+ # Check if we have OCR content
293
+ if 'ocr_contents' in result:
294
+ # Create a single view instead of tabs
295
+ content_tab1 = st.container()
296
+
297
+ # Check for images in the result to use later
298
+ has_images = result.get('has_images', False)
299
+ has_image_data = ('pages_data' in result and any(page.get('images', []) for page in result.get('pages_data', [])))
300
+ has_raw_images = ('raw_response_data' in result and 'pages' in result['raw_response_data'] and
301
+ any('images' in page for page in result['raw_response_data']['pages']
302
+ if isinstance(page, dict)))
303
 
304
+ # Display structured content
305
  with content_tab1:
306
+ # Display structured content with markdown formatting
307
  if isinstance(result['ocr_contents'], dict):
308
+ # CSS is now handled in the main layout.py file
309
+
310
+ # Function to process text with markdown support
311
+ def format_markdown_text(text):
312
+ """Format text with markdown and handle special patterns"""
313
+ if not text:
314
+ return ""
315
 
316
+ import re
317
+
318
+ # First, ensure we're working with a string
319
+ if not isinstance(text, str):
320
+ text = str(text)
321
+
322
+ # Ensure newlines are preserved for proper spacing
323
+ # Convert any Windows line endings to Unix
324
+ text = text.replace('\r\n', '\n')
325
+
326
+ # Format dates (MM/DD/YYYY or similar patterns)
327
+ date_pattern = r'\b(0?[1-9]|1[0-2])[\/\-\.](0?[1-9]|[12][0-9]|3[01])[\/\-\.](\d{4}|\d{2})\b'
328
+ text = re.sub(date_pattern, r'**\g<0>**', text)
329
+
330
+ # Detect markdown tables and preserve them
331
+ table_sections = []
332
+ non_table_lines = []
333
+ in_table = False
334
+ table_buffer = []
335
+
336
+ # Process text line by line, preserving tables
337
+ lines = text.split('\n')
338
+ for i, line in enumerate(lines):
339
+ line_stripped = line.strip()
340
+
341
+ # Detect table rows by pipe character
342
+ if '|' in line_stripped and (line_stripped.startswith('|') or line_stripped.endswith('|')):
343
+ if not in_table:
344
+ in_table = True
345
+ if table_buffer:
346
+ table_buffer = []
347
+ table_buffer.append(line)
348
+
349
+ # Check if the next line is a table separator
350
+ if i < len(lines) - 1 and '---' in lines[i+1] and '|' in lines[i+1]:
351
+ table_buffer.append(lines[i+1])
352
+
353
+ # Detect table separators (---|---|---)
354
+ elif in_table and '---' in line_stripped and '|' in line_stripped:
355
+ table_buffer.append(line)
356
+
357
+ # End of table detection
358
+ elif in_table:
359
+ # Check if this is still part of the table
360
+ next_line_is_table = False
361
+ if i < len(lines) - 1:
362
+ next_line = lines[i+1].strip()
363
+ if '|' in next_line and (next_line.startswith('|') or next_line.endswith('|')):
364
+ next_line_is_table = True
365
+
366
+ if not next_line_is_table:
367
+ in_table = False
368
+ # Save the complete table
369
+ if table_buffer:
370
+ table_sections.append('\n'.join(table_buffer))
371
+ table_buffer = []
372
+ # Add current line to non-table lines
373
+ non_table_lines.append(line)
374
+ else:
375
+ # Still part of the table
376
+ table_buffer.append(line)
377
+ else:
378
+ # Not in a table
379
+ non_table_lines.append(line)
380
+
381
+ # Handle any remaining table buffer
382
+ if in_table and table_buffer:
383
+ table_sections.append('\n'.join(table_buffer))
384
+
385
+ # Process non-table lines
386
+ processed_lines = []
387
+ for line in non_table_lines:
388
+ line_stripped = line.strip()
389
+
390
+ # Check if line is in ALL CAPS (and not just a short acronym)
391
+ if line_stripped and line_stripped.isupper() and len(line_stripped) > 3:
392
+ # ALL CAPS line - make bold instead of heading to prevent large display
393
+ processed_lines.append(f"**{line_stripped}**")
394
+ # Process potential headers (lines ending with colon)
395
+ elif line_stripped and line_stripped.endswith(':') and len(line_stripped) < 40:
396
+ # Likely a header - make it bold
397
+ processed_lines.append(f"**{line_stripped}**")
398
+ else:
399
+ # Keep original line with its spacing
400
+ processed_lines.append(line)
401
+
402
+ # Join non-table lines
403
+ processed_text = '\n'.join(processed_lines)
404
+
405
+ # Reinsert tables in the right positions
406
+ for table in table_sections:
407
+ # Generate a unique marker for this table
408
+ marker = f"__TABLE_MARKER_{hash(table) % 10000}__"
409
+ # Find a good position to insert this table
410
+ # For now, just append all tables at the end
411
+ processed_text += f"\n\n{table}\n\n"
412
+
413
+ # Make sure paragraphs have proper spacing but not excessive
414
+ processed_text = re.sub(r'\n{3,}', '\n\n', processed_text)
415
+
416
+ # Ensure two newlines between paragraphs for proper markdown rendering
417
+ processed_text = re.sub(r'([^\n])\n([^\n])', r'\1\n\n\2', processed_text)
418
+
419
+ return processed_text
420
+
421
+ # Collect all available images from the result
422
+ available_images = []
423
+ if has_images and 'pages_data' in result:
424
+ for page_idx, page in enumerate(result['pages_data']):
425
+ if 'images' in page and len(page['images']) > 0:
426
+ for img_idx, img in enumerate(page['images']):
427
+ if 'image_base64' in img:
428
+ available_images.append({
429
+ 'source': 'pages_data',
430
+ 'page': page_idx,
431
+ 'index': img_idx,
432
+ 'data': img['image_base64']
433
+ })
434
+
435
+ # Get images from raw response as well
436
+ if 'raw_response_data' in result:
437
+ raw_data = result['raw_response_data']
438
+ if isinstance(raw_data, dict) and 'pages' in raw_data:
439
+ for page_idx, page in enumerate(raw_data['pages']):
440
+ if isinstance(page, dict) and 'images' in page:
441
+ for img_idx, img in enumerate(page['images']):
442
+ if isinstance(img, dict) and 'base64' in img:
443
+ available_images.append({
444
+ 'source': 'raw_response',
445
+ 'page': page_idx,
446
+ 'index': img_idx,
447
+ 'data': img['base64']
448
+ })
449
+
450
+ # Extract images for display at the top
451
+ images_to_display = []
452
+
453
+ # First, collect all available images
454
+ for img_idx, img in enumerate(available_images):
455
+ if 'data' in img:
456
+ images_to_display.append({
457
+ 'data': img['data'],
458
+ 'id': img.get('id', f"img_{img_idx}"),
459
+ 'index': img_idx
460
+ })
461
+
462
+ # Display images at the top if available
463
+ if images_to_display:
464
+ st.markdown("### Document Images")
465
+ # Create columns for a grid layout (up to 2 columns to make images larger)
466
+ cols_count = min(2, len(images_to_display))
467
+ image_cols = st.columns(cols_count)
468
+
469
+ # Display each image in a column with minimal spacing
470
+ for i, img in enumerate(images_to_display):
471
+ with image_cols[i % cols_count]:
472
+ # Compact image display
473
+ st.image(img['data'], use_container_width=True)
474
+ st.markdown(f"<p style='margin-top:-5px; font-size:0.8rem; color:#666; text-align:center;'>Document Image {i+1}</p>", unsafe_allow_html=True)
475
+
476
+ # Organize sections in a logical order
477
+ section_order = ["title", "author", "date", "summary", "content", "transcript", "metadata"]
478
+ ordered_sections = []
479
+
480
+ # Add known sections first in preferred order
481
+ for section_name in section_order:
482
+ if section_name in result['ocr_contents'] and result['ocr_contents'][section_name]:
483
+ ordered_sections.append(section_name)
484
+
485
+ # Add any remaining sections
486
+ for section in result['ocr_contents'].keys():
487
+ if (section not in ordered_sections and
488
+ section not in ['error', 'partial_text'] and
489
+ result['ocr_contents'][section]):
490
+ ordered_sections.append(section)
491
+
492
+ # If only raw_text is available and no other content, add it last
493
+ if ('raw_text' in result['ocr_contents'] and
494
+ result['ocr_contents']['raw_text'] and
495
+ len(ordered_sections) == 0):
496
+ ordered_sections.append('raw_text')
497
+
498
+ # Add minimal spacing before OCR results
499
+ st.markdown("<div style='margin: 8px 0 4px 0;'></div>", unsafe_allow_html=True)
500
+ st.markdown("### Document Content")
501
+
502
+ # Process each section using expanders
503
+ for i, section in enumerate(ordered_sections):
504
+ content = result['ocr_contents'][section]
505
+
506
+ # Skip empty content
507
+ if not content:
508
+ continue
509
+
510
+ # Create an expander for each section
511
+ # First section is expanded by default
512
+ with st.expander(f"{section.replace('_', ' ').title()}", expanded=(i == 0)):
513
  if isinstance(content, str):
514
+ # Handle image markdown
515
+ if content.startswith("![") and content.endswith(")"):
516
+ try:
517
+ alt_text = content[2:content.index(']')]
518
+ st.info(f"Image description: {alt_text if len(alt_text) > 5 else 'Image'}")
519
+ except Exception:
520
+ st.info("Contains image reference")
521
+ else:
522
+ # Process text content
523
+ formatted_content = format_markdown_text(content).strip()
524
+
525
+ # Check if content contains markdown tables or complex text
526
+ has_tables = '|' in formatted_content and '---' in formatted_content
527
+ has_complex_structure = formatted_content.count('\n') > 5 or formatted_content.count('**') > 2
528
+
529
+ # Use a container with minimal margins
530
+ with st.container():
531
+ # For text-only extractions or content with tables, ensure proper rendering
532
+ if has_tables or has_complex_structure:
533
+ # For text with tables or multiple paragraphs, use special handling
534
+ # First ensure proper markdown spacing
535
+ formatted_content = formatted_content.replace('\n\n\n', '\n\n')
536
+
537
+ # Look for any all caps headers that might be misinterpreted
538
+ import re
539
+ formatted_content = re.sub(
540
+ r'^([A-Z][A-Z\s]+)$',
541
+ r'**\1**',
542
+ formatted_content,
543
+ flags=re.MULTILINE
544
+ )
545
+
546
+ # Preserve table formatting by adding proper spacing
547
+ if has_tables:
548
+ formatted_content = formatted_content.replace('\n|', '\n\n|')
549
+
550
+ # Add proper paragraph spacing
551
+ formatted_content = re.sub(r'([^\n])\n([^\n])', r'\1\n\n\2', formatted_content)
552
+
553
+ # Use standard markdown with custom styling
554
+ st.markdown(formatted_content, unsafe_allow_html=False)
555
+ else:
556
+ # For simpler content, use standard markdown
557
+ st.markdown(formatted_content)
558
+
559
  elif isinstance(content, list):
560
+ # Create markdown list
561
+ list_items = []
562
  for item in content:
563
  if isinstance(item, str):
564
+ item_text = format_markdown_text(item).strip()
565
+ # Handle potential HTML special characters for proper rendering
566
+ item_text = item_text.replace('<', '&lt;').replace('>', '&gt;')
567
+ list_items.append(f"- {item_text}")
568
  else:
569
+ list_items.append(f"- {str(item)}")
570
+
571
+ list_content = "\n".join(list_items)
572
+
573
+ # Use a container with minimal margins
574
+ with st.container():
575
+ # Use standard markdown for better rendering
576
+ st.markdown(list_content)
577
+
578
  elif isinstance(content, dict):
579
+ # Format dictionary content
580
+ dict_items = []
581
  for k, v in content.items():
582
+ key_formatted = k.replace('_', ' ').title()
583
+
584
+ if isinstance(v, str):
585
+ value_formatted = format_markdown_text(v).strip()
586
+ dict_items.append(f"**{key_formatted}:** {value_formatted}")
587
+ else:
588
+ dict_items.append(f"**{key_formatted}:** {str(v)}")
589
+
590
+ dict_content = "\n".join(dict_items)
591
+
592
+ # Use a container with minimal margins
593
+ with st.container():
594
+ # Use standard markdown for better rendering
595
+ st.markdown(dict_content)
596
 
597
+ # Display custom prompt if provided
598
+ if custom_prompt:
599
+ with st.expander("Custom Processing Instructions"):
600
+ st.write(custom_prompt)
601
+
602
+ # No download heading - start directly with buttons
603
 
604
+ # JSON download - use full width for buttons
605
  try:
606
  json_str = json.dumps(result, indent=2)
607
  st.download_button(
 
612
  )
613
  except Exception as e:
614
  st.error(f"Error creating JSON download: {str(e)}")
615
+
 
616
  # Text download
617
  try:
618
  if 'ocr_contents' in result:
 
636
 
637
  def display_document_with_images(result):
638
  """Display document with images"""
639
+ # Check for pages_data first
640
+ if 'pages_data' in result and result['pages_data']:
641
+ pages_data = result['pages_data']
642
+ # If pages_data not available, try to extract from raw_response_data
643
+ elif 'raw_response_data' in result and isinstance(result['raw_response_data'], dict) and 'pages' in result['raw_response_data']:
644
+ # Build pages_data from raw_response_data
645
+ pages_data = []
646
+ raw_pages = result['raw_response_data']['pages']
647
+
648
+ for page_idx, page in enumerate(raw_pages):
649
+ if not isinstance(page, dict):
650
+ continue
651
+
652
+ page_data = {
653
+ 'page_number': page_idx + 1,
654
+ 'markdown': page.get('markdown', ''),
655
+ 'images': []
656
+ }
657
+
658
+ # Extract images if present
659
+ if 'images' in page and isinstance(page['images'], list):
660
+ for img_idx, img in enumerate(page['images']):
661
+ if isinstance(img, dict) and ('base64' in img or 'image_base64' in img):
662
+ img_base64 = img.get('image_base64', img.get('base64', ''))
663
+ if img_base64:
664
+ page_data['images'].append({
665
+ 'id': img.get('id', f"img_{page_idx}_{img_idx}"),
666
+ 'image_base64': img_base64
667
+ })
668
+
669
+ if page_data['markdown'] or page_data['images']:
670
+ pages_data.append(page_data)
671
+ else:
672
  st.info("No image data available.")
673
  return
674
 
675
  # Display each page
676
+ for i, page_data in enumerate(pages_data):
677
  st.markdown(f"### Page {i+1}")
678
 
679
  # Create columns for image and text
680
  img_col, text_col = st.columns([1, 1])
681
 
682
  with img_col:
683
+ # Display the image - check multiple possible field names
684
+ image_displayed = False
685
+
686
+ # Try 'image_data' field first
687
  if 'image_data' in page_data:
688
  try:
689
  # Convert base64 to image
690
  image_data = base64.b64decode(page_data['image_data'])
691
  st.image(io.BytesIO(image_data), use_container_width=True)
692
+ image_displayed = True
693
  except Exception as e:
694
+ st.error(f"Error displaying image from image_data: {str(e)}")
695
+
696
+ # Try 'images' array if image_data didn't work
697
+ if not image_displayed and 'images' in page_data and len(page_data['images']) > 0:
698
+ for img in page_data['images']:
699
+ if 'image_base64' in img:
700
+ try:
701
+ st.image(img['image_base64'], use_container_width=True)
702
+ image_displayed = True
703
+ break
704
+ except Exception as e:
705
+ st.error(f"Error displaying image from images array: {str(e)}")
706
+
707
+ if not image_displayed:
708
  st.info("No image available for this page.")
709
 
710
  with text_col:
711
+ # Get text from various possible fields
712
+ page_text = ""
713
  if 'text' in page_data:
714
+ page_text = page_data['text']
715
+ elif 'markdown' in page_data:
716
+ page_text = page_data['markdown']
717
+
718
+ # Special handling for image markdown in page data
719
+ if page_text.startswith("![") and page_text.endswith(")"):
720
+ # Try to display image if not already displayed
721
+ if not image_displayed and 'raw_response_data' in result:
722
+ raw_data = result['raw_response_data']
723
+ if isinstance(raw_data, dict) and 'pages' in raw_data:
724
+ for raw_page in raw_data['pages']:
725
+ if isinstance(raw_page, dict) and 'images' in raw_page:
726
+ for img in raw_page['images']:
727
+ if isinstance(img, dict) and 'base64' in img:
728
+ st.image(img['base64'])
729
+ st.caption("Image from OCR response")
730
+ image_displayed = True
731
+ break
732
+ if image_displayed:
733
+ break
734
+
735
+ # Try to extract alt text
736
+ try:
737
+ alt_text = page_text[2:page_text.index(']')]
738
+ if alt_text and len(alt_text) > 5: # Only show if alt text is meaningful
739
+ st.info(f"Image description: {alt_text}")
740
+ else:
741
+ st.info("This page contains an image with minimal text")
742
+ except Exception:
743
+ st.info("This page contains an image with minimal text")
744
+
745
+ # Show warning if no image displayed
746
+ if not image_displayed:
747
+ st.warning("Image reference found in text, but no image data is available.")
748
+
749
+ # If no text found but we have raw_text in ocr_contents
750
+ if not page_text and 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
751
+ page_text = result['ocr_contents']['raw_text']
752
+
753
+ # Display the text with editing capability
754
+ if page_text:
755
+ edited_text = st.text_area(f"Page {i+1} Text", page_text, height=300, key=f"page_text_{i}")
756
 
757
+ # Add a simple button to copy the edited text to clipboard
758
+ st.button(f"Copy Text", key=f"copy_btn_{i}")
 
759
  else:
760
  st.info("No text available for this page.")
761
 
762
  def display_previous_results():
763
+ """Display previous results tab content in a simplified, structured view"""
 
764
 
765
+ # Use a clean header with the download button directly next to it
766
+ col1, col2 = st.columns([3, 1])
767
+ with col1:
768
+ st.header("Previous Results")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
769
 
770
  # Display previous results if available
771
  if not st.session_state.previous_results:
772
  st.markdown("""
773
+ <div style="text-align: center; padding: 30px 20px; background-color: #f8f9fa; border-radius: 6px; margin-top: 10px;">
774
+ <div style="font-size: 36px; margin-bottom: 15px;">📄</div>
775
+ <h4 style="margin-bottom: 8px; font-weight: 500;">No Previous Results</h4>
776
+ <p style="font-size: 14px; color: #666;">Process a document to see your results history.</p>
777
  </div>
778
  """, unsafe_allow_html=True)
779
  else:
780
+ # Add download button in the second column next to the header
781
+ with col2:
782
+ try:
783
+ # Create download button for all results
784
+ from ocr_utils import create_results_zip_in_memory
785
+ zip_data = create_results_zip_in_memory(st.session_state.previous_results)
786
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
787
 
788
+ # Simplified filename
789
+ zip_filename = f"ocr_results_{timestamp}.zip"
790
+
791
+ st.download_button(
792
+ label="Download All",
793
+ data=zip_data,
794
+ file_name=zip_filename,
795
+ mime="application/zip",
796
+ help="Download all results as ZIP"
797
+ )
798
+ except Exception:
799
+ # Silent fail - no error message to keep UI clean
800
+ pass
801
 
802
+ # Create a cleaner, more minimal grid for results using Streamlit columns
803
+ # Use a fixed two-column grid for the results list
804
+ num_columns = 2 # Two columns for most screens
805
+
806
+ # Create rows of result cards
807
+ for i in range(0, len(st.session_state.previous_results), num_columns):
808
+ # Create a row of columns
809
+ cols = st.columns(num_columns)
810
+
811
+ # Fill each column with a result card
812
+ for j in range(num_columns):
813
+ index = i + j
814
+ if index < len(st.session_state.previous_results):
815
+ result = st.session_state.previous_results[index]
816
 
817
+ # Get basic info for the card
818
+ file_name = result.get("file_name", f"Document {index+1}")
819
+ timestamp = result.get("timestamp", "")
820
 
821
+ # Determine file type icon
822
+ if file_name.lower().endswith(".pdf"):
823
+ icon = "📄"
824
+ elif any(file_name.lower().endswith(ext) for ext in [".jpg", ".jpeg", ".png", ".gif"]):
825
+ icon = "🖼️"
 
 
826
  else:
827
+ icon = "📝"
828
 
829
+ # Display a simplified card in each column
830
+ with cols[j]:
831
+ # Use a container for better styling control
832
+ with st.container():
833
+ # Create visually cleaner card with less vertical space
834
+ st.markdown(f"""
835
+ <div style="padding: 10px; border: 1px solid #e0e0e0; border-radius: 6px; margin-bottom: 10px;">
836
+ <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 5px;">
837
+ <div style="font-weight: 500; font-size: 14px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap;">{icon} {file_name}</div>
838
+ <div style="color: #666; font-size: 12px;">{timestamp.split()[0] if timestamp else ""}</div>
839
+ </div>
840
+ </div>
841
+ """, unsafe_allow_html=True)
842
+
843
+ # Add a simple button below each card
844
+ if st.button(f"View", key=f"view_{index}", help=f"View {file_name}"):
845
+ st.session_state.selected_previous_result = st.session_state.previous_results[index]
846
+ st.rerun()
847
 
848
  # Display the selected result if available
849
  if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
850
  selected_result = st.session_state.selected_previous_result
851
 
852
+ # Draw a separator between results list and selected document
853
+ st.markdown("<hr style='margin: 20px 0 15px 0; border: none; height: 1px; background-color: #eee;'>", unsafe_allow_html=True)
854
+
855
+ # Create a cleaner header for the selected document
856
+ file_name = selected_result.get('file_name', 'Document')
857
+ st.subheader(f"{file_name}")
858
+
859
+ # Add a simple back button at the top
860
+ if st.button("← Back to Results", key="back_to_results"):
861
+ if 'selected_previous_result' in st.session_state:
862
+ del st.session_state.selected_previous_result
863
+ st.session_state.perform_reset = True
864
+ st.rerun()
865
+
866
+ # Simplified metadata display - just one line with essential info
867
+ meta_html = '<div style="display: flex; flex-wrap: wrap; gap: 12px; margin: 8px 0 15px 0; font-size: 14px; color: #666;">'
868
+
869
+ # Add timestamp
870
+ if 'timestamp' in selected_result:
871
+ meta_html += f'<div>{selected_result["timestamp"]}</div>'
872
 
873
+ # Add languages if available (simplified)
874
+ if 'languages' in selected_result and selected_result['languages']:
875
+ languages = [lang for lang in selected_result['languages'] if lang is not None]
876
+ if languages:
877
+ meta_html += f'<div>Language: {", ".join(languages)}</div>'
878
 
879
+ # Add page count if available (simplified)
880
+ if 'limited_pages' in selected_result:
881
+ meta_html += f'<div>Pages: {selected_result["limited_pages"]["processed"]}/{selected_result["limited_pages"]["total"]}</div>'
 
882
 
883
+ meta_html += '</div>'
884
+ st.markdown(meta_html, unsafe_allow_html=True)
 
885
 
886
+ # Simplified tabs - fewer options for cleaner interface
887
  has_images = selected_result.get('has_images', False)
888
  if has_images:
889
+ view_tabs = st.tabs(["Document Content", "Raw Text", "Images"])
890
+ view_tab1, view_tab2, view_tab3 = view_tabs
891
  else:
892
+ view_tabs = st.tabs(["Document Content", "Raw Text"])
893
+ view_tab1, view_tab2 = view_tabs
894
 
895
+ # Define helper function for formatting text
896
+ def format_text_display(text):
897
+ if not isinstance(text, str):
898
+ return text
899
+
900
+ lines = text.split('\n')
901
+ processed_lines = []
902
+ for line in lines:
903
+ line_stripped = line.strip()
904
+ if line_stripped and line_stripped.isupper() and len(line_stripped) > 3:
905
+ processed_lines.append(f"**{line_stripped}**")
906
+ else:
907
+ processed_lines.append(line)
908
+
909
+ return '\n'.join(processed_lines)
910
+
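To illustrate the helper just defined (an illustrative call only; in the app it runs on OCR output inside this function):

```python
# Stand-alone ALL-CAPS lines longer than three characters are bolded,
# everything else passes through unchanged.
sample = "CHAPTER ONE\nIt was a dark and stormy night."
print(format_text_display(sample))
# **CHAPTER ONE**
# It was a dark and stormy night.
```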
911
+ # First tab - Document Content (simplified structured view)
912
  with view_tab1:
913
+ # Display content in a cleaner, more streamlined format
914
  if 'ocr_contents' in selected_result and isinstance(selected_result['ocr_contents'], dict):
915
+ # Create a more focused list of important sections
916
+ priority_sections = ["title", "content", "transcript", "summary", "raw_text"]
917
+ displayed_sections = set()
918
+
919
+ # First display priority sections
920
+ for section in priority_sections:
921
+ if section in selected_result['ocr_contents'] and selected_result['ocr_contents'][section]:
922
+ content = selected_result['ocr_contents'][section]
923
+ if isinstance(content, str) and content.strip():
924
+ # Only add a subheader for meaningful section names, not raw_text
925
+ if section != "raw_text":
926
+ st.markdown(f"##### {section.replace('_', ' ').title()}")
927
+
928
+ # Format and display content
929
+ formatted_content = format_text_display(content)
930
+ st.markdown(formatted_content)
931
+ displayed_sections.add(section)
932
+
933
+ # Then display any remaining sections not already shown
934
  for section, content in selected_result['ocr_contents'].items():
935
+ if (section not in displayed_sections and
936
+ section not in ['error', 'partial_text'] and
937
+ content):
938
+ st.markdown(f"##### {section.replace('_', ' ').title()}")
939
 
940
  if isinstance(content, str):
941
+ st.markdown(format_text_display(content))
942
  elif isinstance(content, list):
943
  for item in content:
944
+ st.markdown(f"- {item}")
 
 
 
945
  elif isinstance(content, dict):
946
  for k, v in content.items():
947
+ st.markdown(f"**{k}:** {v}")
948
 
949
+ # Second tab - Raw Text (simplified)
950
  with view_tab2:
951
+ # Extract raw text or content
952
  raw_text = ""
953
  if 'ocr_contents' in selected_result:
954
  if 'raw_text' in selected_result['ocr_contents']:
 
956
  elif 'content' in selected_result['ocr_contents']:
957
  raw_text = selected_result['ocr_contents']['content']
958
 
959
+ # Display the text area with raw text
960
+ edited_text = st.text_area("", raw_text, height=300, key="selected_raw_text")
961
 
962
+ # Add buttons in a row
963
+ col1, col2 = st.columns(2)
964
+ with col1:
965
+ st.button("Copy Text", key="selected_copy_btn")
966
+ with col2:
967
+ st.download_button(
968
+ label="Download Text",
969
+ data=edited_text,
970
+ file_name=f"{file_name.split('.')[0]}_text.txt",
971
+ mime="text/plain",
972
+ key="selected_download_btn"
973
+ )
974
 
975
+ # Third tab - With Images (simplified)
976
  if has_images and 'pages_data' in selected_result:
977
  with view_tab3:
978
+ # Simplified image display
979
+ if 'pages_data' in selected_result:
980
+ for i, page_data in enumerate(selected_result['pages_data']):
981
+ # Display each page
982
+ if 'images' in page_data and len(page_data['images']) > 0:
983
+ for img in page_data['images']:
984
+ if 'image_base64' in img:
985
+ st.image(img['image_base64'], use_container_width=True)
986
+
987
+ # Get page text if available
988
+ page_text = ""
989
+ if 'markdown' in page_data:
990
+ page_text = page_data['markdown']
991
+
992
+ # Display text if available
993
+ if page_text:
994
+ with st.expander(f"Page {i+1} Text", expanded=False):
995
+ st.text(page_text)
996
 
997
  def display_about_tab():
998
  """Display about tab content"""
999
+ st.header("About")
1000
 
1001
  # Add app description
1002
  st.markdown("""
1003
  **Historical OCR** is a specialized tool for extracting text from historical documents, manuscripts, and printed materials.
1004
+ """)
1005
 
1006
+ # Purpose section with consistent formatting
1007
+ st.markdown("### Purpose")
1008
+ st.markdown("""
1009
  This tool is designed to assist scholars in historical research by extracting text from challenging documents.
1010
  While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
1011
  historical documents, particularly:
1012
+ """)
1013
 
1014
+ st.markdown("""
1015
  - **Historical newspapers** with complex layouts and aged text
1016
  - **Handwritten documents** from various time periods
1017
  - **Photos of archival materials** that may be difficult to read
1018
+ """)
1019
 
1020
+ # Features section with consistent formatting
1021
+ st.markdown("### Features")
1022
+ st.markdown("""
1023
  - **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
1024
  - **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
1025
  - **Editable Results**: Review and edit extracted text directly in the interface
1026
  - **Structured Content Analysis**: Automatic organization of document content
1027
  - **Multi-language Support**: Process documents in various languages
1028
  - **PDF Processing**: Handle multi-page historical documents
1029
+ """)
1030
 
1031
+ # How to Use section with consistent formatting
1032
+ st.markdown("### How to Use")
1033
+ st.markdown("""
1034
  1. Upload a document (PDF or image)
1035
  2. Select the document type and adjust preprocessing options if needed
1036
  3. Add custom processing instructions for specialized documents
1037
  4. Process the document
1038
  5. Review, edit, and download the results
1039
+ """)
1040
 
1041
+ # Technologies section with consistent formatting
1042
+ st.markdown("### Technologies")
1043
+ st.markdown("""
1044
  - OCR processing using Mistral AI's advanced document understanding capabilities
1045
  - Image preprocessing with OpenCV
1046
  - PDF handling with pdf2image
utils.py CHANGED
@@ -13,12 +13,76 @@ logger = logging.getLogger("utils")
13
  logger.setLevel(logging.INFO)
14
 
15
  def get_base64_from_image(image_path):
16
- """Get base64 string from image file"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  try:
18
- with open(image_path, "rb") as img_file:
19
- return base64.b64encode(img_file.read()).decode('utf-8')
20
  except Exception as e:
21
- logger.error(f"Error encoding image to base64: {str(e)}")
22
  return ""
23
 
24
  def timing(description):
 
13
  logger.setLevel(logging.INFO)
14
 
15
  def get_base64_from_image(image_path):
16
+ """
17
+ Get base64 data URL from image file with proper MIME type.
18
+
19
+ Args:
20
+ image_path: Path to the image file
21
+
22
+ Returns:
23
+ Base64 data URL with appropriate MIME type prefix
24
+ """
25
+ try:
26
+ # Convert to Path object for better handling
27
+ path_obj = Path(image_path)
28
+
29
+ # Determine mime type based on file extension
30
+ mime_type = 'image/jpeg' # Default mime type
31
+ suffix = path_obj.suffix.lower()
32
+ if suffix == '.png':
33
+ mime_type = 'image/png'
34
+ elif suffix == '.gif':
35
+ mime_type = 'image/gif'
36
+ elif suffix in ['.jpg', '.jpeg']:
37
+ mime_type = 'image/jpeg'
38
+ elif suffix == '.pdf':
39
+ mime_type = 'application/pdf'
40
+
41
+ # Read and encode file
42
+ with open(path_obj, "rb") as file:
43
+ encoded = base64.b64encode(file.read()).decode('utf-8')
44
+ return f"data:{mime_type};base64,{encoded}"
45
+ except Exception as e:
46
+ logger.error(f"Error encoding file to base64: {str(e)}")
47
+ return ""
48
+
49
+ def get_base64_from_bytes(file_bytes, mime_type=None, file_name=None):
50
+ """
51
+ Get base64 data URL from file bytes with proper MIME type.
52
+
53
+ Args:
54
+ file_bytes: Binary file data
55
+ mime_type: MIME type of the file (optional)
56
+ file_name: Original file name for MIME type detection (optional)
57
+
58
+ Returns:
59
+ Base64 data URL with appropriate MIME type prefix
60
+ """
61
  try:
62
+ # Determine mime type if not provided
63
+ if mime_type is None and file_name is not None:
64
+ # Get file extension
65
+ suffix = Path(file_name).suffix.lower()
66
+ if suffix == '.png':
67
+ mime_type = 'image/png'
68
+ elif suffix == '.gif':
69
+ mime_type = 'image/gif'
70
+ elif suffix in ['.jpg', '.jpeg']:
71
+ mime_type = 'image/jpeg'
72
+ elif suffix == '.pdf':
73
+ mime_type = 'application/pdf'
74
+ else:
75
+ # Default to octet-stream for unknown types
76
+ mime_type = 'application/octet-stream'
77
+ elif mime_type is None:
78
+ # Default MIME type if we can't determine it
79
+ mime_type = 'application/octet-stream'
80
+
81
+ # Encode and create data URL
82
+ encoded = base64.b64encode(file_bytes).decode('utf-8')
83
+ return f"data:{mime_type};base64,{encoded}"
84
  except Exception as e:
85
+ logger.error(f"Error encoding bytes to base64: {str(e)}")
86
  return ""
87
 
88
  def timing(description):
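For completeness, a hedged usage sketch of the two helpers added above (file paths are illustrative; the module name assumes the repo's utils.py):

```python
from utils import get_base64_from_image, get_base64_from_bytes

# Path-based helper returns a data URL with the inferred MIME type,
# or "" if encoding fails.
data_url = get_base64_from_image("samples/letter.png")
# e.g. "data:image/png;base64,iVBORw0KGgo..."

# Bytes-based helper infers the MIME type from the optional file name.
with open("samples/scan.pdf", "rb") as f:
    pdf_url = get_base64_from_bytes(f.read(), file_name="scan.pdf")
# e.g. "data:application/pdf;base64,JVBERi0x..."
```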