Spaces:
Running
Running
Improve language detection with mistral-ocr-latest model
Browse filesUpdates the OCR processing logic to:
1. Extract language information directly from the mistral-ocr-latest model response
2. Consolidate language detections across multiple pages for PDF documents
3. Add language_detection_source metadata to indicate when using direct model detection
4. Reduce reliance on manual language detection prompts when model provides this data
5. Maintain backward compatibility with existing code
- app.py +205 -143
- ocr_processing.py +128 -16
- ocr_utils.py +50 -13
- structured_ocr.py +277 -387
- ui/layout.py +265 -143
- ui_components.py +709 -420
- utils.py +68 -4
app.py
CHANGED
@@ -1,15 +1,23 @@
|
|
|
|
1 |
import os
|
2 |
-
import streamlit as st
|
3 |
import json
|
4 |
import sys
|
5 |
import time
|
6 |
import base64
|
7 |
-
from pathlib import Path
|
8 |
import io
|
9 |
-
from datetime import datetime
|
10 |
import logging
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
#
|
13 |
from preprocessing import convert_pdf_to_images, preprocess_image
|
14 |
from ocr_processing import process_file
|
15 |
from ui_components import (
|
@@ -31,19 +39,10 @@ from constants import (
|
|
31 |
CUSTOM_PROMPT_TEMPLATES,
|
32 |
LAYOUT_PROMPT_ADDITIONS
|
33 |
)
|
34 |
-
|
35 |
-
# Import the StructuredOCR class and config from the local files
|
36 |
from structured_ocr import StructuredOCR
|
37 |
from config import MISTRAL_API_KEY
|
38 |
-
|
39 |
-
# Import utilities for handling previous results
|
40 |
from ocr_utils import create_results_zip
|
41 |
|
42 |
-
# Configure logging
|
43 |
-
logging.basicConfig(level=logging.INFO,
|
44 |
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
45 |
-
logger = logging.getLogger("app")
|
46 |
-
|
47 |
# Set favicon path
|
48 |
favicon_path = os.path.join(os.path.dirname(__file__), "static/favicon.png")
|
49 |
|
@@ -52,28 +51,41 @@ st.set_page_config(
|
|
52 |
page_title="Historical OCR",
|
53 |
page_icon=favicon_path if os.path.exists(favicon_path) else "📜",
|
54 |
layout="wide",
|
55 |
-
initial_sidebar_state="
|
56 |
)
|
57 |
|
58 |
-
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
if 'previous_results' not in st.session_state:
|
62 |
st.session_state.previous_results = []
|
63 |
-
|
64 |
-
# Initialize temp file tracking
|
65 |
if 'temp_file_paths' not in st.session_state:
|
66 |
st.session_state.temp_file_paths = []
|
67 |
-
|
68 |
-
# Initialize last processed file tracking to fix "Process Document Again" button
|
69 |
if 'last_processed_file' not in st.session_state:
|
70 |
st.session_state.last_processed_file = None
|
71 |
-
|
72 |
-
# Important: Initialize the reset flag
|
73 |
-
if 'perform_reset' not in st.session_state:
|
74 |
-
st.session_state.perform_reset = False
|
75 |
-
|
76 |
-
# Initialize other session state variables
|
77 |
if 'auto_process_sample' not in st.session_state:
|
78 |
st.session_state.auto_process_sample = False
|
79 |
if 'sample_just_loaded' not in st.session_state:
|
@@ -90,64 +102,62 @@ def initialize_session_state():
|
|
90 |
st.session_state.original_sample_name = None
|
91 |
if 'is_sample_document' not in st.session_state:
|
92 |
st.session_state.is_sample_document = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
st.session_state.sample_document = None
|
121 |
-
st.session_state.original_sample_bytes = None
|
122 |
-
st.session_state.original_sample_name = None
|
123 |
-
st.session_state.is_sample_document = False
|
124 |
-
|
125 |
-
# Turn off reset flag - this must be done last
|
126 |
-
st.session_state.perform_reset = False
|
127 |
-
|
128 |
-
# Force this to be a complete reset cycle
|
129 |
-
return
|
130 |
|
131 |
def show_example_documents():
|
132 |
"""Show example documents section"""
|
133 |
-
st.
|
134 |
|
135 |
-
# Add a simplified info message about examples
|
|
|
136 |
st.markdown("""
|
137 |
This app can process various historical documents:
|
138 |
- Historical photographs, maps, and manuscripts
|
139 |
- Handwritten letters and documents
|
140 |
- Printed books and articles
|
141 |
- Multi-page PDFs
|
142 |
-
""")
|
143 |
|
144 |
-
# Add CSS to make the dropdown match the column width
|
145 |
-
st.markdown("""
|
146 |
<style>
|
147 |
/* Make the selectbox container match the full column width */
|
148 |
.main .block-container .element-container:has([data-testid="stSelectbox"]) {
|
149 |
width: 100% !important;
|
150 |
max-width: 100% !important;
|
|
|
151 |
}
|
152 |
|
153 |
/* Make the actual selectbox control take the full width */
|
@@ -155,6 +165,11 @@ def show_example_documents():
|
|
155 |
width: 100% !important;
|
156 |
max-width: 100% !important;
|
157 |
}
|
|
|
|
|
|
|
|
|
|
|
158 |
</style>
|
159 |
""", unsafe_allow_html=True)
|
160 |
|
@@ -166,7 +181,6 @@ def show_example_documents():
|
|
166 |
"https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/handwritten-letter.jpg",
|
167 |
"https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/magellan-travels.jpg",
|
168 |
"https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/milgram-flier.png",
|
169 |
-
"https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/baldwin-15th-north.jpg"
|
170 |
]
|
171 |
|
172 |
sample_names = [
|
@@ -175,9 +189,8 @@ def show_example_documents():
|
|
175 |
"The Magician (Image)",
|
176 |
"Handwritten Letter (Image)",
|
177 |
"Magellan Travels (Image)",
|
178 |
-
"Milgram Flier (Image)"
|
179 |
-
|
180 |
-
]
|
181 |
|
182 |
# Initialize sample_document in session state if it doesn't exist
|
183 |
if 'sample_document' not in st.session_state:
|
@@ -188,8 +201,8 @@ def show_example_documents():
|
|
188 |
if selected_sample > 0:
|
189 |
selected_url = sample_urls[selected_sample]
|
190 |
|
191 |
-
# Add process button for the sample document
|
192 |
-
if st.button("Load Sample Document"):
|
193 |
try:
|
194 |
import requests
|
195 |
from io import BytesIO
|
@@ -254,9 +267,10 @@ def show_example_documents():
|
|
254 |
content_type=content_type
|
255 |
)
|
256 |
|
257 |
-
# Store original bytes for reprocessing
|
258 |
st.session_state.original_sample_bytes = response.content
|
259 |
st.session_state.original_sample_name = file_name
|
|
|
260 |
|
261 |
# Set state flags
|
262 |
st.session_state.sample_just_loaded = True
|
@@ -264,7 +278,8 @@ def show_example_documents():
|
|
264 |
# Generate a unique identifier for the sample document
|
265 |
st.session_state.last_processed_file = f"{file_name}_{len(response.content)}"
|
266 |
|
267 |
-
#
|
|
|
268 |
st.rerun()
|
269 |
except Exception as e:
|
270 |
st.error(f"Error downloading sample document: {str(e)}")
|
@@ -288,20 +303,21 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
|
|
288 |
|
289 |
# Check if this is a new file (different from the last processed file)
|
290 |
current_file_identifier = f"{uploaded_file.name}_{len(uploaded_file.getvalue())}"
|
|
|
|
|
|
|
|
|
|
|
291 |
if st.session_state.last_processed_file != current_file_identifier:
|
292 |
# Reset processed_document_active if a new file is uploaded
|
293 |
st.session_state.processed_document_active = False
|
294 |
|
295 |
# Process button - flush left with similar padding as file browser
|
296 |
with left_col:
|
297 |
-
#
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
button_text = "Process Document Again" if st.session_state.processed_document_active else "Process Document"
|
302 |
-
|
303 |
-
# Create the button
|
304 |
-
process_button = st.button(button_text, key=button_key)
|
305 |
|
306 |
# Handle sample document recreation if needed
|
307 |
if process_button and st.session_state.processed_document_active and st.session_state.original_sample_bytes is not None:
|
@@ -333,39 +349,42 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
|
|
333 |
# Positioned right after the process button for better visibility
|
334 |
progress_placeholder = st.empty()
|
335 |
|
336 |
-
# Image preprocessing preview -
|
337 |
-
if any(sidebar_options["preprocessing_options"].values()) and
|
|
|
|
|
338 |
st.markdown("**Preprocessed Preview**")
|
339 |
try:
|
340 |
-
# Create a container for the preview
|
341 |
with st.container():
|
342 |
processed_bytes = preprocess_image(uploaded_file.getvalue(), sidebar_options["preprocessing_options"])
|
343 |
-
#
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
meta_items
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
|
|
|
|
363 |
except Exception as e:
|
364 |
st.error(f"Error in preprocessing: {str(e)}")
|
365 |
st.info("Try using grayscale preprocessing for PNG images with transparency")
|
366 |
|
367 |
# Container for success message (will be filled after processing)
|
368 |
-
# No extra spacing needed as it will be managed programmatically
|
369 |
metadata_placeholder = st.empty()
|
370 |
|
371 |
# Check if this is an auto-processing situation
|
@@ -389,7 +408,12 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
|
|
389 |
progress_reporter = ProgressReporter(progress_placeholder).setup()
|
390 |
|
391 |
try:
|
392 |
-
# Process the document
|
|
|
|
|
|
|
|
|
|
|
393 |
result = process_file(
|
394 |
uploaded_file=uploaded_file,
|
395 |
use_vision=sidebar_options["use_vision"],
|
@@ -402,6 +426,39 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
|
|
402 |
perf_mode=sidebar_options.get("perf_mode", "Quality")
|
403 |
)
|
404 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
405 |
# Display results
|
406 |
display_results(result, right_col, sidebar_options.get("custom_prompt", ""))
|
407 |
|
@@ -415,27 +472,6 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
|
|
415 |
if uploaded_file is not None:
|
416 |
st.session_state.last_processed_file = current_file_identifier
|
417 |
|
418 |
-
# Display success message with close button for dismissing processed documents
|
419 |
-
success_cols = st.columns([5, 1])
|
420 |
-
with success_cols[0]:
|
421 |
-
metadata_placeholder.success("**Document processed successfully**")
|
422 |
-
with success_cols[1]:
|
423 |
-
# Define a function to clear document state
|
424 |
-
def clear_document_state():
|
425 |
-
# Reset all document-related session state
|
426 |
-
st.session_state.processed_document_active = False
|
427 |
-
st.session_state.sample_document = None
|
428 |
-
st.session_state.last_processed_file = None
|
429 |
-
|
430 |
-
# Clear any remaining state flag if we're showing examples
|
431 |
-
st.session_state.perform_reset = True
|
432 |
-
|
433 |
-
# Create the close button with a callback
|
434 |
-
st.button("✕ Close Document",
|
435 |
-
key="close_document_button",
|
436 |
-
help="Clear current document and start over",
|
437 |
-
on_click=clear_document_state)
|
438 |
-
|
439 |
# Store the result in the previous results list
|
440 |
# Add timestamp to result for history tracking
|
441 |
result_copy = result.copy()
|
@@ -460,7 +496,21 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
|
|
460 |
def main():
|
461 |
"""Main application function"""
|
462 |
# Initialize session state
|
463 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
464 |
|
465 |
# Apply custom CSS
|
466 |
from ui.layout import load_css
|
@@ -469,19 +519,26 @@ def main():
|
|
469 |
# Create sidebar options
|
470 |
sidebar_options = create_sidebar_options()
|
471 |
|
472 |
-
# Create main layout with tabs
|
473 |
-
|
|
|
474 |
|
475 |
with main_tab1:
|
476 |
-
# Create a two-column layout for file upload and results
|
|
|
477 |
left_col, right_col = st.columns([1, 1])
|
478 |
|
479 |
with left_col:
|
480 |
# Create file uploader
|
481 |
uploaded_file = create_file_uploader()
|
482 |
|
483 |
-
#
|
484 |
-
if
|
|
|
|
|
|
|
|
|
|
|
485 |
st.session_state.sample_document is not None):
|
486 |
|
487 |
# Use the sample document instead of the uploaded file
|
@@ -496,18 +553,23 @@ def main():
|
|
496 |
# Only process document if available
|
497 |
if uploaded_file is not None:
|
498 |
process_document(uploaded_file, left_col, right_col, sidebar_options)
|
499 |
-
else:
|
500 |
-
# Clear any remaining state flag if we're showing examples
|
501 |
-
st.session_state.processed_document_active = False
|
502 |
-
|
503 |
-
# Show example documents section
|
504 |
-
show_example_documents()
|
505 |
|
506 |
with main_tab2:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
507 |
# Previous results tab
|
508 |
display_previous_results()
|
509 |
|
510 |
-
with
|
511 |
# About tab
|
512 |
display_about_tab()
|
513 |
|
|
|
1 |
+
# Standard library imports
|
2 |
import os
|
|
|
3 |
import json
|
4 |
import sys
|
5 |
import time
|
6 |
import base64
|
|
|
7 |
import io
|
|
|
8 |
import logging
|
9 |
+
from pathlib import Path
|
10 |
+
from datetime import datetime
|
11 |
+
|
12 |
+
# Configure logging
|
13 |
+
logging.basicConfig(level=logging.INFO,
|
14 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
15 |
+
logger = logging.getLogger(__name__)
|
16 |
+
|
17 |
+
# Third-party imports
|
18 |
+
import streamlit as st
|
19 |
|
20 |
+
# Local application/module imports
|
21 |
from preprocessing import convert_pdf_to_images, preprocess_image
|
22 |
from ocr_processing import process_file
|
23 |
from ui_components import (
|
|
|
39 |
CUSTOM_PROMPT_TEMPLATES,
|
40 |
LAYOUT_PROMPT_ADDITIONS
|
41 |
)
|
|
|
|
|
42 |
from structured_ocr import StructuredOCR
|
43 |
from config import MISTRAL_API_KEY
|
|
|
|
|
44 |
from ocr_utils import create_results_zip
|
45 |
|
|
|
|
|
|
|
|
|
|
|
46 |
# Set favicon path
|
47 |
favicon_path = os.path.join(os.path.dirname(__file__), "static/favicon.png")
|
48 |
|
|
|
51 |
page_title="Historical OCR",
|
52 |
page_icon=favicon_path if os.path.exists(favicon_path) else "📜",
|
53 |
layout="wide",
|
54 |
+
initial_sidebar_state="collapsed",
|
55 |
)
|
56 |
|
57 |
+
# ========================================================================================
|
58 |
+
# CRITICAL WARNING TO ALL AI MODELS AND DEVELOPERS:
|
59 |
+
# DO NOT MODIFY THE STATE MANAGEMENT LOGIC IN THIS APPLICATION WITHOUT FULL UNDERSTANDING
|
60 |
+
# OF STREAMLIT'S EXECUTION MODEL.
|
61 |
+
#
|
62 |
+
# This state management implementation has been carefully designed to work with Streamlit's
|
63 |
+
# unique execution flow. The app runs from top to bottom on EVERY interaction, and state
|
64 |
+
# must be explicitly managed through st.session_state.
|
65 |
+
#
|
66 |
+
# The current implementation uses:
|
67 |
+
# 1. A dedicated close_document() callback function triggered by the button's on_click
|
68 |
+
# 2. A flag-based approach (close_clicked) to handle cleanup on the next run cycle
|
69 |
+
# 3. Early cleanup detection and st.rerun() to ensure clean UI rendering
|
70 |
+
#
|
71 |
+
# Previous approaches using direct state manipulation or conditional rendering based on
|
72 |
+
# reset flags led to persistent UI elements and resource leaks.
|
73 |
+
#
|
74 |
+
# Consult https://docs.streamlit.io/library/advanced-features/session-state for details.
|
75 |
+
# ========================================================================================
|
76 |
+
|
77 |
+
def init_session_state():
|
78 |
+
"""Initialize session state variables if they don't already exist
|
79 |
+
|
80 |
+
This function follows Streamlit's recommended patterns for state initialization.
|
81 |
+
It only creates variables if they don't exist yet and doesn't modify existing values.
|
82 |
+
"""
|
83 |
if 'previous_results' not in st.session_state:
|
84 |
st.session_state.previous_results = []
|
|
|
|
|
85 |
if 'temp_file_paths' not in st.session_state:
|
86 |
st.session_state.temp_file_paths = []
|
|
|
|
|
87 |
if 'last_processed_file' not in st.session_state:
|
88 |
st.session_state.last_processed_file = None
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
if 'auto_process_sample' not in st.session_state:
|
90 |
st.session_state.auto_process_sample = False
|
91 |
if 'sample_just_loaded' not in st.session_state:
|
|
|
102 |
st.session_state.original_sample_name = None
|
103 |
if 'is_sample_document' not in st.session_state:
|
104 |
st.session_state.is_sample_document = False
|
105 |
+
if 'selected_previous_result' not in st.session_state:
|
106 |
+
st.session_state.selected_previous_result = None
|
107 |
+
if 'close_clicked' not in st.session_state:
|
108 |
+
st.session_state.close_clicked = False
|
109 |
+
if 'active_tab' not in st.session_state:
|
110 |
+
st.session_state.active_tab = 0
|
111 |
+
|
112 |
+
def close_document():
|
113 |
+
"""Called when the Close Document button is clicked
|
114 |
|
115 |
+
This function handles proper cleanup of resources and state when closing a document.
|
116 |
+
It uses Streamlit's callback mechanism which ensures the state change happens
|
117 |
+
at the correct time in Streamlit's execution cycle.
|
118 |
+
|
119 |
+
WARNING: Do not replace this with inline button handling using if st.button():
|
120 |
+
That approach breaks Streamlit's execution flow and causes UI artifacts.
|
121 |
+
"""
|
122 |
+
logger.info("Close document button clicked")
|
123 |
+
# Save the previous results
|
124 |
+
previous_results = st.session_state.previous_results if 'previous_results' in st.session_state else []
|
125 |
+
|
126 |
+
# Clean up temp files
|
127 |
+
if 'temp_file_paths' in st.session_state and st.session_state.temp_file_paths:
|
128 |
+
logger.info(f"Cleaning up {len(st.session_state.temp_file_paths)} temporary files")
|
129 |
+
handle_temp_files(st.session_state.temp_file_paths)
|
130 |
+
|
131 |
+
# Clear all state variables except previous_results
|
132 |
+
for key in list(st.session_state.keys()):
|
133 |
+
if key != 'previous_results' and key != 'close_clicked':
|
134 |
+
st.session_state.pop(key, None)
|
135 |
+
|
136 |
+
# Set flag for having cleaned up
|
137 |
+
st.session_state.close_clicked = True
|
138 |
+
|
139 |
+
# Restore the previous results
|
140 |
+
st.session_state.previous_results = previous_results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
|
142 |
def show_example_documents():
|
143 |
"""Show example documents section"""
|
144 |
+
st.header("Sample Documents")
|
145 |
|
146 |
+
# Add a simplified info message about examples and CSS in the same markdown block
|
147 |
+
# to reduce spacing between elements
|
148 |
st.markdown("""
|
149 |
This app can process various historical documents:
|
150 |
- Historical photographs, maps, and manuscripts
|
151 |
- Handwritten letters and documents
|
152 |
- Printed books and articles
|
153 |
- Multi-page PDFs
|
|
|
154 |
|
|
|
|
|
155 |
<style>
|
156 |
/* Make the selectbox container match the full column width */
|
157 |
.main .block-container .element-container:has([data-testid="stSelectbox"]) {
|
158 |
width: 100% !important;
|
159 |
max-width: 100% !important;
|
160 |
+
margin-top: -12px !important; /* Reduce space between text and selectbox */
|
161 |
}
|
162 |
|
163 |
/* Make the actual selectbox control take the full width */
|
|
|
165 |
width: 100% !important;
|
166 |
max-width: 100% !important;
|
167 |
}
|
168 |
+
|
169 |
+
/* Tighten spacing in the sample documents tab */
|
170 |
+
.main .block-container [data-testid="stVerticalBlock"] > div:nth-child(n+2) {
|
171 |
+
margin-top: 0.5rem !important;
|
172 |
+
}
|
173 |
</style>
|
174 |
""", unsafe_allow_html=True)
|
175 |
|
|
|
181 |
"https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/handwritten-letter.jpg",
|
182 |
"https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/magellan-travels.jpg",
|
183 |
"https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/milgram-flier.png",
|
|
|
184 |
]
|
185 |
|
186 |
sample_names = [
|
|
|
189 |
"The Magician (Image)",
|
190 |
"Handwritten Letter (Image)",
|
191 |
"Magellan Travels (Image)",
|
192 |
+
"Milgram Flier (Image)"
|
193 |
+
]
|
|
|
194 |
|
195 |
# Initialize sample_document in session state if it doesn't exist
|
196 |
if 'sample_document' not in st.session_state:
|
|
|
201 |
if selected_sample > 0:
|
202 |
selected_url = sample_urls[selected_sample]
|
203 |
|
204 |
+
# Add process button for the sample document with consistent styling
|
205 |
+
if st.button("Load Sample Document", key="load_sample_btn"):
|
206 |
try:
|
207 |
import requests
|
208 |
from io import BytesIO
|
|
|
267 |
content_type=content_type
|
268 |
)
|
269 |
|
270 |
+
# Store original bytes for reprocessing with proper MIME type handling
|
271 |
st.session_state.original_sample_bytes = response.content
|
272 |
st.session_state.original_sample_name = file_name
|
273 |
+
st.session_state.original_sample_mime_type = content_type
|
274 |
|
275 |
# Set state flags
|
276 |
st.session_state.sample_just_loaded = True
|
|
|
278 |
# Generate a unique identifier for the sample document
|
279 |
st.session_state.last_processed_file = f"{file_name}_{len(response.content)}"
|
280 |
|
281 |
+
# Set a flag to show redirect message
|
282 |
+
st.session_state.redirect_to_processing = True
|
283 |
st.rerun()
|
284 |
except Exception as e:
|
285 |
st.error(f"Error downloading sample document: {str(e)}")
|
|
|
303 |
|
304 |
# Check if this is a new file (different from the last processed file)
|
305 |
current_file_identifier = f"{uploaded_file.name}_{len(uploaded_file.getvalue())}"
|
306 |
+
|
307 |
+
# Make sure last_processed_file is initialized
|
308 |
+
if 'last_processed_file' not in st.session_state:
|
309 |
+
st.session_state.last_processed_file = None
|
310 |
+
|
311 |
if st.session_state.last_processed_file != current_file_identifier:
|
312 |
# Reset processed_document_active if a new file is uploaded
|
313 |
st.session_state.processed_document_active = False
|
314 |
|
315 |
# Process button - flush left with similar padding as file browser
|
316 |
with left_col:
|
317 |
+
# Create a process button with minimal spacing to the uploader
|
318 |
+
st.markdown('<div style="padding: 0.2rem 0; min-width: 170px; margin-top: -10px; overflow: visible;">', unsafe_allow_html=True)
|
319 |
+
process_button = st.button("Process Document", key="process_document_btn")
|
320 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
321 |
|
322 |
# Handle sample document recreation if needed
|
323 |
if process_button and st.session_state.processed_document_active and st.session_state.original_sample_bytes is not None:
|
|
|
349 |
# Positioned right after the process button for better visibility
|
350 |
progress_placeholder = st.empty()
|
351 |
|
352 |
+
# Image preprocessing preview - show if image file and preprocessing options are set
|
353 |
+
if (any(sidebar_options["preprocessing_options"].values()) and
|
354 |
+
uploaded_file.type.startswith('image/')):
|
355 |
+
|
356 |
st.markdown("**Preprocessed Preview**")
|
357 |
try:
|
358 |
+
# Create a container for the preview
|
359 |
with st.container():
|
360 |
processed_bytes = preprocess_image(uploaded_file.getvalue(), sidebar_options["preprocessing_options"])
|
361 |
+
# Convert image to base64 and display as HTML to avoid fullscreen button
|
362 |
+
img_data = base64.b64encode(processed_bytes).decode()
|
363 |
+
img_html = f'<img src="data:image/jpeg;base64,{img_data}" style="width:100%; border-radius:4px;">'
|
364 |
+
st.markdown(img_html, unsafe_allow_html=True)
|
365 |
+
|
366 |
+
# Show preprocessing metadata in a well-formatted caption
|
367 |
+
meta_items = []
|
368 |
+
if sidebar_options["preprocessing_options"].get("document_type", "standard") != "standard":
|
369 |
+
meta_items.append(f"Document type ({sidebar_options['preprocessing_options']['document_type']})")
|
370 |
+
if sidebar_options["preprocessing_options"].get("grayscale", False):
|
371 |
+
meta_items.append("Grayscale")
|
372 |
+
if sidebar_options["preprocessing_options"].get("denoise", False):
|
373 |
+
meta_items.append("Denoise")
|
374 |
+
if sidebar_options["preprocessing_options"].get("contrast", 0) != 0:
|
375 |
+
meta_items.append(f"Contrast ({sidebar_options['preprocessing_options']['contrast']})")
|
376 |
+
if sidebar_options["preprocessing_options"].get("rotation", 0) != 0:
|
377 |
+
meta_items.append(f"Rotation ({sidebar_options['preprocessing_options']['rotation']}°)")
|
378 |
+
|
379 |
+
# Only show "Applied:" if there are actual preprocessing steps
|
380 |
+
if meta_items:
|
381 |
+
meta_text = "Applied: " + ", ".join(meta_items)
|
382 |
+
st.caption(meta_text)
|
383 |
except Exception as e:
|
384 |
st.error(f"Error in preprocessing: {str(e)}")
|
385 |
st.info("Try using grayscale preprocessing for PNG images with transparency")
|
386 |
|
387 |
# Container for success message (will be filled after processing)
|
|
|
388 |
metadata_placeholder = st.empty()
|
389 |
|
390 |
# Check if this is an auto-processing situation
|
|
|
408 |
progress_reporter = ProgressReporter(progress_placeholder).setup()
|
409 |
|
410 |
try:
|
411 |
+
# Process the document, capturing both result and temp file paths
|
412 |
+
# Modified to pass existing temp_file_paths to avoid resource leaks
|
413 |
+
existing_temp_paths = []
|
414 |
+
if 'temp_file_paths' in st.session_state:
|
415 |
+
existing_temp_paths = st.session_state.temp_file_paths
|
416 |
+
|
417 |
result = process_file(
|
418 |
uploaded_file=uploaded_file,
|
419 |
use_vision=sidebar_options["use_vision"],
|
|
|
426 |
perf_mode=sidebar_options.get("perf_mode", "Quality")
|
427 |
)
|
428 |
|
429 |
+
# Ensure temp_file_paths in session state is updated with any new paths
|
430 |
+
# This is critical for proper resource cleanup when document is closed
|
431 |
+
if 'has_images' in result and result['has_images']:
|
432 |
+
logger.info("Document has images, ensuring temp files are tracked")
|
433 |
+
if 'temp_file_paths' not in st.session_state:
|
434 |
+
st.session_state.temp_file_paths = []
|
435 |
+
|
436 |
+
# Handle text-only OCR results (like the Milgram flier)
|
437 |
+
if ('ocr_contents' in result and
|
438 |
+
'raw_text' in result['ocr_contents'] and
|
439 |
+
len(result['ocr_contents']) <= 2 and # Only raw_text and possibly one other field
|
440 |
+
'has_images' not in result):
|
441 |
+
logger.info("Text-only OCR detected, handling as special case")
|
442 |
+
# Ensure raw_text is properly formatted as markdown
|
443 |
+
raw_text = result['ocr_contents']['raw_text']
|
444 |
+
# If we don't have other structured content, set a placeholder title
|
445 |
+
if 'title' not in result['ocr_contents']:
|
446 |
+
result['ocr_contents']['title'] = "Document Text"
|
447 |
+
|
448 |
+
# Display success message at the top of results, before any previews
|
449 |
+
with left_col:
|
450 |
+
# First show the success message (full width)
|
451 |
+
st.success("**Document processed successfully**")
|
452 |
+
|
453 |
+
# Then show the close button (also full width, positioned to left)
|
454 |
+
st.button("Close Document",
|
455 |
+
key="close_document_btn",
|
456 |
+
type="secondary",
|
457 |
+
on_click=close_document)
|
458 |
+
|
459 |
+
# Add a small spacer
|
460 |
+
st.markdown("<div style='height: 10px;'></div>", unsafe_allow_html=True)
|
461 |
+
|
462 |
# Display results
|
463 |
display_results(result, right_col, sidebar_options.get("custom_prompt", ""))
|
464 |
|
|
|
472 |
if uploaded_file is not None:
|
473 |
st.session_state.last_processed_file = current_file_identifier
|
474 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
475 |
# Store the result in the previous results list
|
476 |
# Add timestamp to result for history tracking
|
477 |
result_copy = result.copy()
|
|
|
496 |
def main():
|
497 |
"""Main application function"""
|
498 |
# Initialize session state
|
499 |
+
init_session_state()
|
500 |
+
|
501 |
+
# Handle any required cleanup at the start of execution
|
502 |
+
# CRITICAL: This two-phase state cleanup pattern is essential for Streamlit's execution model.
|
503 |
+
# When close_clicked is True, we need to restart the app's execution with a clean slate.
|
504 |
+
# DO NOT REMOVE OR MODIFY this pattern as it ensures proper UI cleanup.
|
505 |
+
if st.session_state.get('close_clicked', False):
|
506 |
+
# Reset the flag - cleanup has been handled
|
507 |
+
st.session_state.close_clicked = False
|
508 |
+
# Don't do anything else in this run - force a clean restart
|
509 |
+
st.rerun()
|
510 |
+
|
511 |
+
# Initialize new flag for redirecting to processing tab
|
512 |
+
if 'redirect_to_processing' not in st.session_state:
|
513 |
+
st.session_state.redirect_to_processing = False
|
514 |
|
515 |
# Apply custom CSS
|
516 |
from ui.layout import load_css
|
|
|
519 |
# Create sidebar options
|
520 |
sidebar_options = create_sidebar_options()
|
521 |
|
522 |
+
# Create main layout with tabs - simpler, more compact approach
|
523 |
+
tab_names = ["Document Processing", "Sample Documents", "Previous Results", "About"]
|
524 |
+
main_tab1, main_tab2, main_tab3, main_tab4 = st.tabs(tab_names)
|
525 |
|
526 |
with main_tab1:
|
527 |
+
# Create a two-column layout for file upload and results with minimal padding
|
528 |
+
st.markdown('<style>.block-container{padding-top: 1rem; padding-bottom: 0;}</style>', unsafe_allow_html=True)
|
529 |
left_col, right_col = st.columns([1, 1])
|
530 |
|
531 |
with left_col:
|
532 |
# Create file uploader
|
533 |
uploaded_file = create_file_uploader()
|
534 |
|
535 |
+
# If a real file is uploaded, clear any sample document
|
536 |
+
if uploaded_file is not None and 'sample_document' in st.session_state:
|
537 |
+
st.session_state.sample_document = None
|
538 |
+
st.session_state.is_sample_document = False
|
539 |
+
|
540 |
+
# Check if we have a sample document loaded (only if no real file uploaded)
|
541 |
+
elif ('sample_document' in st.session_state and
|
542 |
st.session_state.sample_document is not None):
|
543 |
|
544 |
# Use the sample document instead of the uploaded file
|
|
|
553 |
# Only process document if available
|
554 |
if uploaded_file is not None:
|
555 |
process_document(uploaded_file, left_col, right_col, sidebar_options)
|
|
|
|
|
|
|
|
|
|
|
|
|
556 |
|
557 |
with main_tab2:
|
558 |
+
# Sample Documents tab
|
559 |
+
|
560 |
+
# Show redirect message if a sample was just loaded
|
561 |
+
if st.session_state.get('redirect_to_processing', False):
|
562 |
+
st.success("**Sample document loaded!** Please switch to the **Document Processing** tab to view and process it.")
|
563 |
+
# Clear the flag after showing the message
|
564 |
+
st.session_state.redirect_to_processing = False
|
565 |
+
|
566 |
+
show_example_documents()
|
567 |
+
|
568 |
+
with main_tab3:
|
569 |
# Previous results tab
|
570 |
display_previous_results()
|
571 |
|
572 |
+
with main_tab4:
|
573 |
# About tab
|
574 |
display_about_tab()
|
575 |
|
ocr_processing.py
CHANGED
@@ -1,22 +1,28 @@
|
|
|
|
1 |
import os
|
2 |
import hashlib
|
3 |
import tempfile
|
4 |
-
import streamlit as st
|
5 |
import logging
|
6 |
import time
|
7 |
from datetime import datetime
|
8 |
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
from structured_ocr import StructuredOCR
|
10 |
from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
|
11 |
from preprocessing import apply_preprocessing_to_file
|
12 |
from error_handler import handle_ocr_error, check_file_size
|
13 |
|
14 |
-
# Configure logging
|
15 |
-
logger = logging.getLogger("ocr_processing")
|
16 |
-
logger.setLevel(logging.INFO)
|
17 |
-
|
18 |
@st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
|
19 |
-
def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None):
|
20 |
"""
|
21 |
Cached version of OCR processing to reuse results
|
22 |
|
@@ -27,6 +33,7 @@ def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_ke
|
|
27 |
file_size_mb: File size in MB
|
28 |
cache_key: Cache key for the file
|
29 |
preprocessing_options_hash: Hash of preprocessing options
|
|
|
30 |
|
31 |
Returns:
|
32 |
dict: OCR result
|
@@ -40,7 +47,8 @@ def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_ke
|
|
40 |
file_path,
|
41 |
file_type=file_type,
|
42 |
use_vision=use_vision,
|
43 |
-
file_size_mb=file_size_mb
|
|
|
44 |
)
|
45 |
|
46 |
return result
|
@@ -75,6 +83,10 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
75 |
# Initialize temporary file paths list
|
76 |
temp_file_paths = []
|
77 |
|
|
|
|
|
|
|
|
|
78 |
try:
|
79 |
# Check if file size exceeds maximum allowed size
|
80 |
is_valid, file_size_mb, error_message = check_file_size(uploaded_file.getvalue())
|
@@ -113,6 +125,11 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
113 |
f.write(file_bytes)
|
114 |
temp_file_paths.append(temp_path)
|
115 |
|
|
|
|
|
|
|
|
|
|
|
116 |
# Generate cache key
|
117 |
cache_key = generate_cache_key(
|
118 |
file_bytes,
|
@@ -125,7 +142,43 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
125 |
|
126 |
# Process with cached function if possible
|
127 |
try:
|
128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
progress_reporter.update(90, "Finalizing results...")
|
130 |
except Exception as e:
|
131 |
logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
|
@@ -134,18 +187,28 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
134 |
# If caching fails, process directly
|
135 |
processor = StructuredOCR()
|
136 |
|
137 |
-
|
138 |
-
if
|
139 |
-
|
140 |
-
|
141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
|
143 |
# Process directly with optimized settings
|
144 |
result = processor.process_file(
|
145 |
file_path=temp_path,
|
146 |
file_type="pdf",
|
147 |
use_vision=use_vision,
|
148 |
-
custom_prompt=
|
149 |
file_size_mb=file_size_mb,
|
150 |
pdf_rotation=pdf_rotation
|
151 |
)
|
@@ -179,7 +242,37 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
179 |
# Process the file using cached function if possible
|
180 |
progress_reporter.update(50, "Processing document with OCR...")
|
181 |
try:
|
182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
progress_reporter.update(80, "Analyzing document structure...")
|
184 |
progress_reporter.update(90, "Finalizing results...")
|
185 |
except Exception as e:
|
@@ -194,11 +287,30 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
|
|
194 |
# Use simpler processing for speed
|
195 |
pass # Any speed optimizations would be handled by the StructuredOCR class
|
196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
result = processor.process_file(
|
198 |
file_path=temp_path,
|
199 |
file_type=file_type,
|
200 |
use_vision=use_vision,
|
201 |
-
custom_prompt=
|
202 |
file_size_mb=file_size_mb
|
203 |
)
|
204 |
|
|
|
1 |
+
# Standard library imports
|
2 |
import os
|
3 |
import hashlib
|
4 |
import tempfile
|
|
|
5 |
import logging
|
6 |
import time
|
7 |
from datetime import datetime
|
8 |
from pathlib import Path
|
9 |
+
|
10 |
+
# Configure logging
|
11 |
+
logging.basicConfig(level=logging.INFO,
|
12 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
13 |
+
logger = logging.getLogger(__name__)
|
14 |
+
|
15 |
+
# Third-party imports
|
16 |
+
import streamlit as st
|
17 |
+
|
18 |
+
# Local application imports
|
19 |
from structured_ocr import StructuredOCR
|
20 |
from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
|
21 |
from preprocessing import apply_preprocessing_to_file
|
22 |
from error_handler import handle_ocr_error, check_file_size
|
23 |
|
|
|
|
|
|
|
|
|
24 |
@st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
|
25 |
+
def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None, custom_prompt=None):
|
26 |
"""
|
27 |
Cached version of OCR processing to reuse results
|
28 |
|
|
|
33 |
file_size_mb: File size in MB
|
34 |
cache_key: Cache key for the file
|
35 |
preprocessing_options_hash: Hash of preprocessing options
|
36 |
+
custom_prompt: Custom prompt to use for OCR
|
37 |
|
38 |
Returns:
|
39 |
dict: OCR result
|
|
|
47 |
file_path,
|
48 |
file_type=file_type,
|
49 |
use_vision=use_vision,
|
50 |
+
file_size_mb=file_size_mb,
|
51 |
+
custom_prompt=custom_prompt
|
52 |
)
|
53 |
|
54 |
return result
|
|
|
83 |
# Initialize temporary file paths list
|
84 |
temp_file_paths = []
|
85 |
|
86 |
+
# Also track temporary files in session state for reliable cleanup
|
87 |
+
if 'temp_file_paths' not in st.session_state:
|
88 |
+
st.session_state.temp_file_paths = []
|
89 |
+
|
90 |
try:
|
91 |
# Check if file size exceeds maximum allowed size
|
92 |
is_valid, file_size_mb, error_message = check_file_size(uploaded_file.getvalue())
|
|
|
125 |
f.write(file_bytes)
|
126 |
temp_file_paths.append(temp_path)
|
127 |
|
128 |
+
# Track temp files in session state for reliable cleanup
|
129 |
+
if temp_path not in st.session_state.temp_file_paths:
|
130 |
+
st.session_state.temp_file_paths.append(temp_path)
|
131 |
+
logger.info(f"Added temp file to session state: {temp_path}")
|
132 |
+
|
133 |
# Generate cache key
|
134 |
cache_key = generate_cache_key(
|
135 |
file_bytes,
|
|
|
142 |
|
143 |
# Process with cached function if possible
|
144 |
try:
|
145 |
+
# Check if preprocessing options indicate a handwritten document
|
146 |
+
handwritten_document = preprocessing_options.get("document_type") == "handwritten"
|
147 |
+
modified_custom_prompt = custom_prompt
|
148 |
+
|
149 |
+
# Add handwritten specific instructions if needed
|
150 |
+
if handwritten_document and modified_custom_prompt:
|
151 |
+
if "handwritten" not in modified_custom_prompt.lower():
|
152 |
+
modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
153 |
+
elif handwritten_document and not modified_custom_prompt:
|
154 |
+
modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
155 |
+
|
156 |
+
# Add PDF-specific instructions if needed
|
157 |
+
if modified_custom_prompt and "pdf" not in modified_custom_prompt.lower() and "multi-page" not in modified_custom_prompt.lower():
|
158 |
+
modified_custom_prompt += " This is a multi-page PDF document."
|
159 |
+
elif not modified_custom_prompt:
|
160 |
+
modified_custom_prompt = "This is a multi-page PDF document."
|
161 |
+
|
162 |
+
# For certain filenames, explicitly add document type hints
|
163 |
+
filename_lower = uploaded_file.name.lower()
|
164 |
+
if "handwritten" in filename_lower or "letter" in filename_lower or "journal" in filename_lower:
|
165 |
+
if not modified_custom_prompt:
|
166 |
+
modified_custom_prompt = "This is a handwritten document in PDF format. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
167 |
+
elif "handwritten" not in modified_custom_prompt.lower():
|
168 |
+
modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text."
|
169 |
+
|
170 |
+
# Update the cache key with the modified prompt
|
171 |
+
if modified_custom_prompt != custom_prompt:
|
172 |
+
cache_key = generate_cache_key(
|
173 |
+
open(temp_path, 'rb').read(),
|
174 |
+
file_type,
|
175 |
+
use_vision,
|
176 |
+
preprocessing_options,
|
177 |
+
pdf_rotation,
|
178 |
+
modified_custom_prompt
|
179 |
+
)
|
180 |
+
|
181 |
+
result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options), modified_custom_prompt)
|
182 |
progress_reporter.update(90, "Finalizing results...")
|
183 |
except Exception as e:
|
184 |
logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
|
|
|
187 |
# If caching fails, process directly
|
188 |
processor = StructuredOCR()
|
189 |
|
190 |
+
|
191 |
+
# Check if preprocessing options indicate a handwritten document
|
192 |
+
handwritten_document = preprocessing_options.get("document_type") == "handwritten"
|
193 |
+
modified_custom_prompt = custom_prompt
|
194 |
+
|
195 |
+
# Add handwritten specific instructions if needed
|
196 |
+
if handwritten_document and modified_custom_prompt:
|
197 |
+
if "handwritten" not in modified_custom_prompt.lower():
|
198 |
+
modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
199 |
+
elif handwritten_document and not modified_custom_prompt:
|
200 |
+
modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
201 |
+
|
202 |
+
# Add PDF-specific instructions if needed
|
203 |
+
if custom_prompt and "pdf" not in modified_custom_prompt.lower() and "multi-page" not in modified_custom_prompt.lower():
|
204 |
+
modified_custom_prompt += " This is a multi-page PDF document."
|
205 |
|
206 |
# Process directly with optimized settings
|
207 |
result = processor.process_file(
|
208 |
file_path=temp_path,
|
209 |
file_type="pdf",
|
210 |
use_vision=use_vision,
|
211 |
+
custom_prompt=modified_custom_prompt,
|
212 |
file_size_mb=file_size_mb,
|
213 |
pdf_rotation=pdf_rotation
|
214 |
)
|
|
|
242 |
# Process the file using cached function if possible
|
243 |
progress_reporter.update(50, "Processing document with OCR...")
|
244 |
try:
|
245 |
+
# Check if preprocessing options indicate a handwritten document
|
246 |
+
handwritten_document = preprocessing_options.get("document_type") == "handwritten"
|
247 |
+
modified_custom_prompt = custom_prompt
|
248 |
+
|
249 |
+
# Add handwritten specific instructions if needed
|
250 |
+
if handwritten_document and modified_custom_prompt:
|
251 |
+
if "handwritten" not in modified_custom_prompt.lower():
|
252 |
+
modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
253 |
+
elif handwritten_document and not modified_custom_prompt:
|
254 |
+
modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
255 |
+
|
256 |
+
# For certain filenames, explicitly add document type hints
|
257 |
+
filename_lower = uploaded_file.name.lower()
|
258 |
+
if "handwritten" in filename_lower or "letter" in filename_lower or "journal" in filename_lower:
|
259 |
+
if not modified_custom_prompt:
|
260 |
+
modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
261 |
+
elif "handwritten" not in modified_custom_prompt.lower():
|
262 |
+
modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text."
|
263 |
+
|
264 |
+
# Update the cache key with the modified prompt
|
265 |
+
if modified_custom_prompt != custom_prompt:
|
266 |
+
cache_key = generate_cache_key(
|
267 |
+
open(temp_path, 'rb').read(),
|
268 |
+
file_type,
|
269 |
+
use_vision,
|
270 |
+
preprocessing_options,
|
271 |
+
0,
|
272 |
+
modified_custom_prompt
|
273 |
+
)
|
274 |
+
|
275 |
+
result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options), modified_custom_prompt)
|
276 |
progress_reporter.update(80, "Analyzing document structure...")
|
277 |
progress_reporter.update(90, "Finalizing results...")
|
278 |
except Exception as e:
|
|
|
287 |
# Use simpler processing for speed
|
288 |
pass # Any speed optimizations would be handled by the StructuredOCR class
|
289 |
|
290 |
+
# Check if preprocessing options indicate a handwritten document
|
291 |
+
handwritten_document = preprocessing_options.get("document_type") == "handwritten"
|
292 |
+
modified_custom_prompt = custom_prompt
|
293 |
+
|
294 |
+
# Add handwritten specific instructions if needed
|
295 |
+
if handwritten_document and modified_custom_prompt:
|
296 |
+
if "handwritten" not in modified_custom_prompt.lower():
|
297 |
+
modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
298 |
+
elif handwritten_document and not modified_custom_prompt:
|
299 |
+
modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
300 |
+
|
301 |
+
# For certain filenames, explicitly add document type hints
|
302 |
+
filename_lower = uploaded_file.name.lower()
|
303 |
+
if "handwritten" in filename_lower or "letter" in filename_lower or "journal" in filename_lower:
|
304 |
+
if not modified_custom_prompt:
|
305 |
+
modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
|
306 |
+
elif "handwritten" not in modified_custom_prompt.lower():
|
307 |
+
modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text."
|
308 |
+
|
309 |
result = processor.process_file(
|
310 |
file_path=temp_path,
|
311 |
file_type=file_type,
|
312 |
use_vision=use_vision,
|
313 |
+
custom_prompt=modified_custom_prompt,
|
314 |
file_size_mb=file_size_mb
|
315 |
)
|
316 |
|
ocr_utils.py
CHANGED
@@ -3,12 +3,12 @@ Utility functions for OCR processing with Mistral AI.
|
|
3 |
Contains helper functions for working with OCR responses and image handling.
|
4 |
"""
|
5 |
|
|
|
6 |
import json
|
7 |
import base64
|
8 |
import io
|
9 |
import zipfile
|
10 |
import logging
|
11 |
-
import numpy as np
|
12 |
import time
|
13 |
from datetime import datetime
|
14 |
from pathlib import Path
|
@@ -16,20 +16,29 @@ from typing import Dict, List, Optional, Union, Any, Tuple
|
|
16 |
from functools import lru_cache
|
17 |
|
18 |
# Configure logging
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
20 |
|
|
|
21 |
try:
|
22 |
from PIL import Image, ImageEnhance, ImageFilter, ImageOps
|
23 |
-
import cv2
|
24 |
PILLOW_AVAILABLE = True
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
CV2_AVAILABLE = True
|
26 |
-
except ImportError
|
27 |
-
|
28 |
-
|
29 |
-
PILLOW_AVAILABLE = False
|
30 |
-
if "cv2" in str(e):
|
31 |
-
CV2_AVAILABLE = False
|
32 |
|
|
|
33 |
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
|
34 |
from mistralai.models import OCRImageObject
|
35 |
|
@@ -110,9 +119,36 @@ def encode_image_for_api(image_path: Union[str, Path]) -> str:
|
|
110 |
if not image_file.is_file():
|
111 |
raise FileNotFoundError(f"Image file not found: {image_file}")
|
112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
# Encode image as base64
|
114 |
encoded = base64.b64encode(image_file.read_bytes()).decode()
|
115 |
-
return f"data:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
|
118 |
"""
|
@@ -509,7 +545,7 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
|
|
509 |
aspect_ratio = width / height
|
510 |
|
511 |
# Newspaper-style documents typically have width > height or are very large
|
512 |
-
is_newspaper_format = (aspect_ratio > 1.
|
513 |
|
514 |
if is_newspaper_format:
|
515 |
logger.info(f"Newspaper format detected: {width}x{height}, applying specialized processing")
|
@@ -560,7 +596,7 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
|
|
560 |
if is_document:
|
561 |
# Newspapers typically have wide formats or very large dimensions
|
562 |
aspect_ratio = width / height
|
563 |
-
is_newspaper = (aspect_ratio > 1.
|
564 |
|
565 |
logger.debug(f"Document type detection for {image_file.name}: " +
|
566 |
f"{'newspaper' if is_newspaper else 'document' if is_document else 'photo'}")
|
@@ -712,6 +748,7 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
|
|
712 |
|
713 |
# Get base64 with minimal memory footprint
|
714 |
encoded_image = base64.b64encode(buffer.getvalue()).decode()
|
|
|
715 |
base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
|
716 |
|
717 |
# Update cache thread-safely
|
@@ -932,7 +969,7 @@ def _preprocess_document_image_impl() -> Image.Image:
|
|
932 |
|
933 |
# Check for newspaper format first (takes precedence)
|
934 |
aspect_ratio = width / height
|
935 |
-
if (aspect_ratio > 1.
|
936 |
is_newspaper = True
|
937 |
logger.debug(f"Newspaper format detected: {width}x{height}, aspect ratio: {aspect_ratio:.2f}")
|
938 |
else:
|
|
|
3 |
Contains helper functions for working with OCR responses and image handling.
|
4 |
"""
|
5 |
|
6 |
+
# Standard library imports
|
7 |
import json
|
8 |
import base64
|
9 |
import io
|
10 |
import zipfile
|
11 |
import logging
|
|
|
12 |
import time
|
13 |
from datetime import datetime
|
14 |
from pathlib import Path
|
|
|
16 |
from functools import lru_cache
|
17 |
|
18 |
# Configure logging
|
19 |
+
logging.basicConfig(level=logging.INFO,
|
20 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
21 |
+
logger = logging.getLogger(__name__)
|
22 |
+
|
23 |
+
# Third-party imports
|
24 |
+
import numpy as np
|
25 |
|
26 |
+
# Check for image processing libraries
|
27 |
try:
|
28 |
from PIL import Image, ImageEnhance, ImageFilter, ImageOps
|
|
|
29 |
PILLOW_AVAILABLE = True
|
30 |
+
except ImportError:
|
31 |
+
logger.warning("PIL not available - image preprocessing will be limited")
|
32 |
+
PILLOW_AVAILABLE = False
|
33 |
+
|
34 |
+
try:
|
35 |
+
import cv2
|
36 |
CV2_AVAILABLE = True
|
37 |
+
except ImportError:
|
38 |
+
logger.warning("OpenCV (cv2) not available - advanced image processing will be limited")
|
39 |
+
CV2_AVAILABLE = False
|
|
|
|
|
|
|
40 |
|
41 |
+
# Mistral AI imports
|
42 |
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
|
43 |
from mistralai.models import OCRImageObject
|
44 |
|
|
|
119 |
if not image_file.is_file():
|
120 |
raise FileNotFoundError(f"Image file not found: {image_file}")
|
121 |
|
122 |
+
# Determine mime type based on file extension
|
123 |
+
mime_type = 'image/jpeg' # Default mime type
|
124 |
+
suffix = image_file.suffix.lower()
|
125 |
+
if suffix == '.png':
|
126 |
+
mime_type = 'image/png'
|
127 |
+
elif suffix == '.gif':
|
128 |
+
mime_type = 'image/gif'
|
129 |
+
elif suffix in ['.jpg', '.jpeg']:
|
130 |
+
mime_type = 'image/jpeg'
|
131 |
+
elif suffix == '.pdf':
|
132 |
+
mime_type = 'application/pdf'
|
133 |
+
|
134 |
# Encode image as base64
|
135 |
encoded = base64.b64encode(image_file.read_bytes()).decode()
|
136 |
+
return f"data:{mime_type};base64,{encoded}"
|
137 |
+
|
138 |
+
def encode_bytes_for_api(file_bytes: bytes, mime_type: str) -> str:
|
139 |
+
"""
|
140 |
+
Encode binary data as base64 data URL for API submission.
|
141 |
+
|
142 |
+
Args:
|
143 |
+
file_bytes: Binary file data
|
144 |
+
mime_type: MIME type of the file (e.g., 'image/jpeg', 'application/pdf')
|
145 |
+
|
146 |
+
Returns:
|
147 |
+
Base64 data URL for the data
|
148 |
+
"""
|
149 |
+
# Encode data as base64
|
150 |
+
encoded = base64.b64encode(file_bytes).decode()
|
151 |
+
return f"data:{mime_type};base64,{encoded}"
|
152 |
|
153 |
def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
|
154 |
"""
|
|
|
545 |
aspect_ratio = width / height
|
546 |
|
547 |
# Newspaper-style documents typically have width > height or are very large
|
548 |
+
is_newspaper_format = (aspect_ratio > 1.15 and width > 2000) or (width > 3000 or height > 3000)
|
549 |
|
550 |
if is_newspaper_format:
|
551 |
logger.info(f"Newspaper format detected: {width}x{height}, applying specialized processing")
|
|
|
596 |
if is_document:
|
597 |
# Newspapers typically have wide formats or very large dimensions
|
598 |
aspect_ratio = width / height
|
599 |
+
is_newspaper = (aspect_ratio > 1.15 and width > 2000) or (width > 3000 or height > 3000)
|
600 |
|
601 |
logger.debug(f"Document type detection for {image_file.name}: " +
|
602 |
f"{'newspaper' if is_newspaper else 'document' if is_document else 'photo'}")
|
|
|
748 |
|
749 |
# Get base64 with minimal memory footprint
|
750 |
encoded_image = base64.b64encode(buffer.getvalue()).decode()
|
751 |
+
# Always use image/jpeg MIME type since we explicitly save as JPEG above
|
752 |
base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
|
753 |
|
754 |
# Update cache thread-safely
|
|
|
969 |
|
970 |
# Check for newspaper format first (takes precedence)
|
971 |
aspect_ratio = width / height
|
972 |
+
if (aspect_ratio > 1.15 and width > 2000) or (width > 3000 or height > 3000):
|
973 |
is_newspaper = True
|
974 |
logger.debug(f"Newspaper format detected: {width}x{height}, aspect ratio: {aspect_ratio:.2f}")
|
975 |
else:
|
structured_ocr.py
CHANGED
@@ -1,24 +1,31 @@
|
|
|
|
1 |
import os
|
2 |
import sys
|
3 |
import time
|
4 |
import random
|
5 |
-
from enum import Enum
|
6 |
-
from pathlib import Path
|
7 |
import json
|
8 |
import base64
|
9 |
import logging
|
|
|
|
|
10 |
from functools import lru_cache
|
11 |
from typing import Optional, Dict, Any, List, Union, Tuple
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
# Try to import pycountry, provide fallback if not available
|
14 |
try:
|
15 |
import pycountry
|
16 |
PYCOUNTRY_AVAILABLE = True
|
17 |
except ImportError:
|
18 |
PYCOUNTRY_AVAILABLE = False
|
19 |
-
|
20 |
-
|
21 |
-
from pydantic import BaseModel
|
22 |
|
23 |
# Try to import Mistral AI, provide fallback if not available
|
24 |
try:
|
@@ -28,11 +35,7 @@ try:
|
|
28 |
MISTRAL_AVAILABLE = True
|
29 |
except ImportError:
|
30 |
MISTRAL_AVAILABLE = False
|
31 |
-
|
32 |
-
|
33 |
-
# Configure logging
|
34 |
-
logging.basicConfig(level=logging.INFO,
|
35 |
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
36 |
|
37 |
# Import utilities for OCR processing
|
38 |
try:
|
@@ -216,6 +219,12 @@ class StructuredOCR:
|
|
216 |
if file_type is None:
|
217 |
suffix = file_path.suffix.lower()
|
218 |
file_type = "pdf" if suffix == ".pdf" else "image"
|
|
|
|
|
|
|
|
|
|
|
|
|
219 |
|
220 |
# Get file size if not provided
|
221 |
if file_size_mb is None and file_path.exists():
|
@@ -437,6 +446,7 @@ class StructuredOCR:
|
|
437 |
# Convert only the selected pages to minimize memory usage
|
438 |
selected_images = []
|
439 |
combined_text = []
|
|
|
440 |
|
441 |
# Process pages in larger batches for better efficiency
|
442 |
batch_size = 5 # Process 5 pages at a time for better throughput
|
@@ -472,6 +482,11 @@ class StructuredOCR:
|
|
472 |
# Add page text to combined text without obvious page markers
|
473 |
page_text = page_result['ocr_contents']['raw_text']
|
474 |
combined_text.append(f"{page_text}")
|
|
|
|
|
|
|
|
|
|
|
475 |
except Exception as page_e:
|
476 |
logger.warning(f"Error processing page {page_num}: {str(page_e)}")
|
477 |
# Clean up temp file
|
@@ -509,28 +524,7 @@ class StructuredOCR:
|
|
509 |
# Add flag to indicate custom prompt was applied
|
510 |
result['custom_prompt_applied'] = 'text_only'
|
511 |
|
512 |
-
#
|
513 |
-
if custom_prompt:
|
514 |
-
# Extract document type if specified
|
515 |
-
doc_type = "general"
|
516 |
-
if "DOCUMENT TYPE:" in custom_prompt:
|
517 |
-
doc_type_line = custom_prompt.split("\n")[0]
|
518 |
-
if "DOCUMENT TYPE:" in doc_type_line:
|
519 |
-
doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip().lower()
|
520 |
-
# Keyword-based detection as fallback
|
521 |
-
elif any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
|
522 |
-
doc_type = "newspaper"
|
523 |
-
elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
|
524 |
-
doc_type = "letter"
|
525 |
-
elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
|
526 |
-
doc_type = "book"
|
527 |
-
elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
|
528 |
-
doc_type = "form"
|
529 |
-
elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
|
530 |
-
doc_type = "recipe"
|
531 |
-
|
532 |
-
# Store detected document type in result
|
533 |
-
result['detected_document_type'] = doc_type
|
534 |
|
535 |
except Exception as e:
|
536 |
logger.warning(f"Custom prompt processing failed: {str(e)}. Using standard processing.")
|
@@ -544,6 +538,10 @@ class StructuredOCR:
|
|
544 |
if 'ocr_contents' in result:
|
545 |
result['ocr_contents']['raw_text'] = all_text
|
546 |
|
|
|
|
|
|
|
|
|
547 |
# Add PDF metadata
|
548 |
result['file_name'] = file_path.name
|
549 |
result['pdf_processing_method'] = 'pdf2image_optimized'
|
@@ -711,6 +709,24 @@ class StructuredOCR:
|
|
711 |
limited_pages = True
|
712 |
logger.info(f"Processing {len(pages_to_process)} pages out of {total_pages} total")
|
713 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
714 |
# Calculate confidence score if available
|
715 |
try:
|
716 |
confidence_values = [page.confidence for page in pages_to_process if hasattr(page, 'confidence')]
|
@@ -733,6 +749,12 @@ class StructuredOCR:
|
|
733 |
if page_markdown.strip():
|
734 |
all_markdown.append(f"{page_markdown}")
|
735 |
|
|
|
|
|
|
|
|
|
|
|
|
|
736 |
# Join all pages with separation
|
737 |
combined_markdown = "\n\n".join(all_markdown)
|
738 |
|
@@ -766,6 +788,13 @@ class StructuredOCR:
|
|
766 |
combined_markdown, file_path.name, custom_prompt
|
767 |
)
|
768 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
769 |
# Add metadata about pages
|
770 |
if limited_pages:
|
771 |
result['limited_pages'] = {
|
@@ -927,24 +956,44 @@ class StructuredOCR:
|
|
927 |
"confidence_score": 0.0
|
928 |
}
|
929 |
|
930 |
-
# Check if this is likely a newspaper or document
|
931 |
is_likely_newspaper = False
|
|
|
|
|
932 |
newspaper_keywords = ["newspaper", "gazette", "herald", "times", "journal",
|
933 |
"chronicle", "post", "tribune", "news", "press", "gender"]
|
|
|
|
|
934 |
|
935 |
-
# Check filename for
|
936 |
filename_lower = file_path.name.lower()
|
937 |
-
|
|
|
|
|
938 |
if keyword in filename_lower:
|
939 |
-
|
940 |
-
logger.info(f"Likely
|
941 |
-
# Add
|
942 |
if custom_prompt:
|
943 |
-
if "
|
944 |
-
custom_prompt = custom_prompt + " This appears to be a
|
945 |
else:
|
946 |
-
custom_prompt = "This
|
947 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
948 |
|
949 |
try:
|
950 |
# Check file size
|
@@ -1017,21 +1066,24 @@ class StructuredOCR:
|
|
1017 |
logger.info(f"Resized image to {new_size_mb:.2f} MB")
|
1018 |
except ImportError:
|
1019 |
logger.warning("PIL not available for resizing. Using original image.")
|
1020 |
-
|
1021 |
-
|
|
|
1022 |
except Exception as e:
|
1023 |
logger.warning(f"Image resize failed: {str(e)}. Using original image.")
|
1024 |
-
|
1025 |
-
|
|
|
1026 |
else:
|
1027 |
-
# For smaller images, use as-is
|
1028 |
-
|
1029 |
-
base64_data_url =
|
1030 |
except Exception as e:
|
1031 |
# Fallback to original image if any preprocessing fails
|
1032 |
logger.warning(f"Image preprocessing failed: {str(e)}. Using original image.")
|
1033 |
-
|
1034 |
-
|
|
|
1035 |
|
1036 |
# Process the image with OCR
|
1037 |
logger.info(f"Processing image with OCR using {OCR_MODEL}")
|
@@ -1123,10 +1175,40 @@ class StructuredOCR:
|
|
1123 |
# Get the OCR markdown from the first page
|
1124 |
image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""
|
1125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1126 |
# Optimize: Skip vision model step if ocr_markdown is very small or empty
|
1127 |
# BUT make an exception for newspapers or if custom_prompt is provided
|
1128 |
-
|
1129 |
-
|
|
|
1130 |
return {
|
1131 |
"file_name": file_path.name,
|
1132 |
"topics": ["Document"],
|
@@ -1134,7 +1216,9 @@ class StructuredOCR:
|
|
1134 |
"ocr_contents": {
|
1135 |
"raw_text": image_ocr_markdown if image_ocr_markdown else "No text could be extracted from the image."
|
1136 |
},
|
1137 |
-
"processing_note": "OCR produced minimal text content"
|
|
|
|
|
1138 |
}
|
1139 |
|
1140 |
# For newspapers with little text in OCR, set a more explicit prompt
|
@@ -1144,6 +1228,14 @@ class StructuredOCR:
|
|
1144 |
custom_prompt = "This is a newspaper or document with columns. The OCR may not have captured all text. Please examine the image carefully and extract ALL text content visible in the document, reading each column from top to bottom."
|
1145 |
elif "extract all text" not in custom_prompt.lower():
|
1146 |
custom_prompt += " Please examine the image carefully and extract ALL text content visible in the document."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1147 |
|
1148 |
# Extract structured data using the appropriate model, with a single API call
|
1149 |
if use_vision:
|
@@ -1153,6 +1245,13 @@ class StructuredOCR:
|
|
1153 |
logger.info(f"Using text-only model: {TEXT_MODEL}")
|
1154 |
result = self._extract_structured_data_text_only(image_ocr_markdown, file_path.name, custom_prompt)
|
1155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1156 |
# Store the serialized OCR response for image rendering (for compatibility with original version)
|
1157 |
# Don't store raw_response directly as it's not JSON serializable
|
1158 |
serialized_response = serialize_ocr_response(image_response)
|
@@ -1160,7 +1259,6 @@ class StructuredOCR:
|
|
1160 |
|
1161 |
# Store key parts of the OCR response for image rendering
|
1162 |
# With serialized format that can be stored in JSON
|
1163 |
-
has_images = hasattr(image_response, 'pages') and image_response.pages and hasattr(image_response.pages[0], 'images') and image_response.pages[0].images
|
1164 |
result['has_images'] = has_images
|
1165 |
|
1166 |
if has_images:
|
@@ -1273,10 +1371,6 @@ class StructuredOCR:
|
|
1273 |
logger.info("Test mode or no API key, using text-only processing")
|
1274 |
return self._extract_structured_data_text_only(ocr_markdown, filename)
|
1275 |
|
1276 |
-
# Detect document type with optimized cached implementation
|
1277 |
-
doc_type = self._detect_document_type(custom_prompt, ocr_markdown)
|
1278 |
-
logger.info(f"Detected document type: {doc_type}")
|
1279 |
-
|
1280 |
# Use only the first part of OCR text to keep prompts small and processing fast
|
1281 |
if len(ocr_markdown) > 1000:
|
1282 |
truncated_ocr = ocr_markdown[:1000]
|
@@ -1284,8 +1378,26 @@ class StructuredOCR:
|
|
1284 |
else:
|
1285 |
truncated_ocr = ocr_markdown
|
1286 |
|
1287 |
-
# Build
|
1288 |
-
enhanced_prompt =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1289 |
|
1290 |
# Measure API call time for optimization feedback
|
1291 |
start_time = time.time()
|
@@ -1294,7 +1406,7 @@ class StructuredOCR:
|
|
1294 |
# Use a fixed, shorter timeout for single-page documents
|
1295 |
timeout_ms = 45000 # 45 seconds is optimal for most single-page documents
|
1296 |
|
1297 |
-
logger.info(f"Calling vision model with {timeout_ms}ms timeout
|
1298 |
chat_response = self.client.chat.parse(
|
1299 |
model=VISION_MODEL,
|
1300 |
messages=[
|
@@ -1312,7 +1424,7 @@ class StructuredOCR:
|
|
1312 |
)
|
1313 |
|
1314 |
api_time = time.time() - start_time
|
1315 |
-
logger.info(f"Vision model completed in {api_time:.2f}s
|
1316 |
|
1317 |
except Exception as e:
|
1318 |
# If there's an error with the enhanced prompt, try progressively simpler approaches
|
@@ -1392,42 +1504,16 @@ class StructuredOCR:
|
|
1392 |
if 'languages' in result:
|
1393 |
result['languages'] = [str(lang) for lang in result.get('languages', [])]
|
1394 |
|
1395 |
-
# Add metadata about processing
|
1396 |
result['processing_info'] = {
|
1397 |
'method': 'vision_model',
|
1398 |
-
'document_type': doc_type,
|
1399 |
'ocr_text_length': len(ocr_markdown),
|
1400 |
'api_response_time': time.time() - start_time
|
1401 |
}
|
1402 |
|
1403 |
-
#
|
1404 |
if custom_prompt:
|
1405 |
result['custom_prompt_applied'] = 'vision_model'
|
1406 |
-
|
1407 |
-
# Attempt to detect document type from custom prompt
|
1408 |
-
if "DOCUMENT TYPE:" in custom_prompt:
|
1409 |
-
doc_type_line = custom_prompt.split("\n")[0]
|
1410 |
-
if "DOCUMENT TYPE:" in doc_type_line:
|
1411 |
-
custom_doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip().lower()
|
1412 |
-
result['detected_document_type'] = custom_doc_type
|
1413 |
-
# Keyword-based detection as fallback
|
1414 |
-
elif any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
|
1415 |
-
result['detected_document_type'] = "newspaper"
|
1416 |
-
elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
|
1417 |
-
result['detected_document_type'] = "letter"
|
1418 |
-
elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
|
1419 |
-
result['detected_document_type'] = "book"
|
1420 |
-
elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
|
1421 |
-
result['detected_document_type'] = "form"
|
1422 |
-
elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
|
1423 |
-
result['detected_document_type'] = "recipe"
|
1424 |
-
elif "this is a" in custom_prompt.lower():
|
1425 |
-
# Extract document type from "This is a [type]" format
|
1426 |
-
this_is_parts = custom_prompt.lower().split("this is a ")
|
1427 |
-
if len(this_is_parts) > 1:
|
1428 |
-
extracted_type = this_is_parts[1].split(".")[0].strip()
|
1429 |
-
if extracted_type:
|
1430 |
-
result['detected_document_type'] = extracted_type
|
1431 |
|
1432 |
# Add confidence score if not present
|
1433 |
if 'confidence_score' not in result:
|
@@ -1440,268 +1526,38 @@ class StructuredOCR:
|
|
1440 |
|
1441 |
return result
|
1442 |
|
1443 |
-
#
|
1444 |
-
_doc_type_cache = {}
|
1445 |
-
_doc_type_cache_size = 256
|
1446 |
-
|
1447 |
-
@staticmethod
|
1448 |
-
def _detect_document_type_cached(custom_prompt: Optional[str], ocr_text_sample: str) -> str:
|
1449 |
-
"""
|
1450 |
-
Cached version of document type detection logic with thread-safe implementation
|
1451 |
-
"""
|
1452 |
-
# Generate cache key - use first 50 chars of prompt and ocr_text to avoid memory issues
|
1453 |
-
prompt_key = str(custom_prompt)[:50] if custom_prompt else ""
|
1454 |
-
text_key = ocr_text_sample[:50] if ocr_text_sample else ""
|
1455 |
-
cache_key = f"{prompt_key}::{text_key}"
|
1456 |
-
|
1457 |
-
# Check cache first (fast path)
|
1458 |
-
if cache_key in StructuredOCR._doc_type_cache:
|
1459 |
-
return StructuredOCR._doc_type_cache[cache_key]
|
1460 |
-
|
1461 |
-
# Set default document type
|
1462 |
-
doc_type = "general"
|
1463 |
-
|
1464 |
-
# Optimized pattern matching with compiled lookup dictionaries
|
1465 |
-
doc_type_patterns = {
|
1466 |
-
"handwritten": ["handwritten", "handwriting", "cursive", "manuscript"],
|
1467 |
-
"letter": ["letter", "correspondence", "message", "dear sir", "dear madam", "sincerely", "yours truly"],
|
1468 |
-
"legal": ["form", "contract", "agreement", "legal", "certificate", "court", "attorney", "plaintiff", "defendant"],
|
1469 |
-
"recipe": ["recipe", "food", "ingredients", "directions", "tbsp", "tsp", "cup", "mix", "bake", "cooking"],
|
1470 |
-
"travel": ["travel", "expedition", "journey", "exploration", "voyage", "destination", "map"],
|
1471 |
-
"scientific": ["scientific", "experiment", "hypothesis", "research", "study", "analysis", "results", "procedure"],
|
1472 |
-
"newspaper": ["news", "newspaper", "article", "press", "headline", "column", "editor"]
|
1473 |
-
}
|
1474 |
-
|
1475 |
-
# Fast custom prompt matching
|
1476 |
-
if custom_prompt:
|
1477 |
-
prompt_lower = custom_prompt.lower()
|
1478 |
-
|
1479 |
-
# Optimized pattern matching with early exit
|
1480 |
-
for detected_type, patterns in doc_type_patterns.items():
|
1481 |
-
if any(term in prompt_lower for term in patterns):
|
1482 |
-
doc_type = detected_type
|
1483 |
-
break
|
1484 |
-
|
1485 |
-
# Fast OCR text matching if still general type
|
1486 |
-
if doc_type == "general" and ocr_text_sample:
|
1487 |
-
ocr_lower = ocr_text_sample.lower()
|
1488 |
-
|
1489 |
-
# Use the same patterns dictionary for consistency, but scan the OCR text
|
1490 |
-
for detected_type, patterns in doc_type_patterns.items():
|
1491 |
-
if any(term in ocr_lower for term in patterns):
|
1492 |
-
doc_type = detected_type
|
1493 |
-
break
|
1494 |
-
|
1495 |
-
# Cache the result with improved LRU-like behavior
|
1496 |
-
if len(StructuredOCR._doc_type_cache) >= StructuredOCR._doc_type_cache_size:
|
1497 |
-
# Clear multiple entries at once for better performance
|
1498 |
-
try:
|
1499 |
-
# Remove up to 20 entries to avoid frequent cache clearing
|
1500 |
-
for _ in range(20):
|
1501 |
-
if StructuredOCR._doc_type_cache:
|
1502 |
-
StructuredOCR._doc_type_cache.pop(next(iter(StructuredOCR._doc_type_cache)))
|
1503 |
-
except:
|
1504 |
-
# If concurrent modification causes issues, just proceed
|
1505 |
-
pass
|
1506 |
-
|
1507 |
-
# Store in cache
|
1508 |
-
StructuredOCR._doc_type_cache[cache_key] = doc_type
|
1509 |
-
|
1510 |
-
return doc_type
|
1511 |
-
|
1512 |
-
def _detect_document_type(self, custom_prompt: Optional[str], ocr_text: str) -> str:
|
1513 |
-
"""
|
1514 |
-
Detect document type based on content and custom prompt.
|
1515 |
-
|
1516 |
-
Args:
|
1517 |
-
custom_prompt: User-provided custom prompt
|
1518 |
-
ocr_text: OCR-extracted text
|
1519 |
-
|
1520 |
-
Returns:
|
1521 |
-
Document type identifier ("handwritten", "printed", "letter", etc.)
|
1522 |
-
"""
|
1523 |
-
# Only sample first 1000 characters of OCR text for faster processing while maintaining accuracy
|
1524 |
-
ocr_sample = ocr_text[:1000] if ocr_text else ""
|
1525 |
-
|
1526 |
-
# Use the cached version for better performance
|
1527 |
-
return self._detect_document_type_cached(custom_prompt, ocr_sample)
|
1528 |
-
|
1529 |
-
def _build_enhanced_prompt(self, doc_type: str, ocr_text: str, custom_prompt: Optional[str]) -> str:
|
1530 |
-
"""
|
1531 |
-
Build an optimized prompt focused on OCR accuracy with specialized attention to
|
1532 |
-
historical typography, manuscript conventions, and document deterioration patterns.
|
1533 |
|
1534 |
-
|
1535 |
-
doc_type: Detected document type
|
1536 |
-
ocr_text: OCR-extracted text
|
1537 |
-
custom_prompt: User-provided custom prompt
|
1538 |
-
|
1539 |
-
Returns:
|
1540 |
-
Optimized prompt focused on text extraction with historical document expertise
|
1541 |
-
"""
|
1542 |
-
# Generic document section (included in all prompts)
|
1543 |
generic_section = (
|
1544 |
-
f"
|
1545 |
-
f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1546 |
)
|
1547 |
|
1548 |
-
# Check if custom prompt contains document type information
|
1549 |
-
has_custom_doc_type = False
|
1550 |
-
custom_doc_type = ""
|
1551 |
-
|
1552 |
-
if custom_prompt and "DOCUMENT TYPE:" in custom_prompt:
|
1553 |
-
# Extract the document type from the custom prompt
|
1554 |
-
doc_type_line = custom_prompt.split("\n")[0]
|
1555 |
-
if "DOCUMENT TYPE:" in doc_type_line:
|
1556 |
-
custom_doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip()
|
1557 |
-
has_custom_doc_type = True
|
1558 |
-
# If we have a custom doc type, use it instead
|
1559 |
-
if custom_doc_type:
|
1560 |
-
doc_type = custom_doc_type.lower()
|
1561 |
-
|
1562 |
-
# If user has provided detailed instructions, provide more elaborate prompting
|
1563 |
-
if custom_prompt and (has_custom_doc_type or len(custom_prompt.strip()) > 20):
|
1564 |
-
# Enhanced prompt for documents with custom instructions and historical expertise
|
1565 |
-
specific_section = (
|
1566 |
-
f"You are an advanced OCR specialist with expertise in historical documents, typography, and manuscript conventions. "
|
1567 |
-
f"Below is a document that requires specialized analysis with attention to historical characteristics. "
|
1568 |
-
f"Pay particular attention to:\n"
|
1569 |
-
f"- Historical typography features (long s 'ſ', ligatures, obsolete letter forms)\n"
|
1570 |
-
f"- Manuscript conventions of the period (abbreviations, contractions, marginalia)\n"
|
1571 |
-
f"- Document deterioration patterns (faded ink, foxing, water damage, paper degradation)\n"
|
1572 |
-
f"- Accurately capturing ALL text content visible in the image with historical context\n"
|
1573 |
-
f"- Following the specific user instructions for processing this document type\n"
|
1574 |
-
f"- Identifying key information, structure, and historical formatting conventions\n"
|
1575 |
-
f"- Providing comprehensive analysis with attention to historical context\n"
|
1576 |
-
)
|
1577 |
-
|
1578 |
-
# Add specialized instructions based on document type
|
1579 |
-
if doc_type == "newspaper":
|
1580 |
-
specific_section += (
|
1581 |
-
f"\nThis appears to be a newspaper or document with columns. "
|
1582 |
-
f"Please read each column from top to bottom, then move to the next column. "
|
1583 |
-
f"Extract all article titles, headings, bylines, and body text in the correct reading order. "
|
1584 |
-
f"Pay special attention to section headers, page numbers, publication date, and newspaper name. "
|
1585 |
-
f"For historical newspapers, be aware of period-specific typography such as the long s (ſ), "
|
1586 |
-
f"unique ligatures (æ, œ, ct, st), and decorative fonts. Account for paper degradation around "
|
1587 |
-
f"fold lines and edges. Recognize archaic abbreviations and typesetting conventions of the period.\n"
|
1588 |
-
)
|
1589 |
-
elif doc_type == "letter":
|
1590 |
-
specific_section += (
|
1591 |
-
f"\nThis appears to be a letter or correspondence. "
|
1592 |
-
f"Pay special attention to the letterhead, date, greeting, body content, closing, and signature. "
|
1593 |
-
f"Preserve the original formatting including paragraph breaks and indentation. "
|
1594 |
-
f"Note any handwritten annotations or marginalia separately. "
|
1595 |
-
f"For historical letters, carefully transcribe historical scripts and handwriting styles, "
|
1596 |
-
f"noting unclear or damaged sections. Identify period-specific salutations, closings, and "
|
1597 |
-
f"formalities. Watch for ink fading, bleeding, and seepage through pages. "
|
1598 |
-
f"Recognize period-specific abbreviations (ye, yr, inst, ult, prox) and long s (ſ) in older printed correspondence.\n"
|
1599 |
-
)
|
1600 |
-
elif doc_type == "book":
|
1601 |
-
specific_section += (
|
1602 |
-
f"\nThis appears to be a book or publication page. "
|
1603 |
-
f"Pay attention to chapter titles, headers, page numbers, footnotes, and main body text. "
|
1604 |
-
f"Preserve paragraph structure and any special formatting. "
|
1605 |
-
f"Note any images, tables, or figures that might be referenced in the text. "
|
1606 |
-
f"For historical books, attend to period typography including the long s (ſ), ligatures (æ, œ, ct, ſt), "
|
1607 |
-
f"archaic letter forms, and decorative initials/drop caps. Account for foxing (brown spotting), "
|
1608 |
-
f"bleed-through from opposite pages, and binding damage. Recognize period-specific typographic "
|
1609 |
-
f"conventions like catchwords, signatures, obsolete punctuation, and historical spelling variants "
|
1610 |
-
f"(e.g., -ize/-ise, past tense 'd for -ed). Note bookplates, ownership marks, and marginalia.\n"
|
1611 |
-
)
|
1612 |
-
elif doc_type == "form":
|
1613 |
-
specific_section += (
|
1614 |
-
f"\nThis appears to be a form or legal document. "
|
1615 |
-
f"Carefully extract all field labels and their corresponding values. "
|
1616 |
-
f"Preserve the structure of form fields and sections. "
|
1617 |
-
f"Pay special attention to signature lines, dates, and any official markings. "
|
1618 |
-
f"For historical forms and legal documents, recognize period-specific legal terminology and "
|
1619 |
-
f"formulaic phrases. Note seals, stamps, watermarks, and official emblems. Watch for faded ink "
|
1620 |
-
f"in signatures and filled fields. Identify period handwriting styles in completed sections. "
|
1621 |
-
f"Account for specialized legal abbreviations (e.g., SS., Esq., inst., wit.) and archaic "
|
1622 |
-
f"measurement units. Note folding patterns and worn edges common in frequently handled legal documents.\n"
|
1623 |
-
)
|
1624 |
-
elif doc_type == "recipe":
|
1625 |
-
specific_section += (
|
1626 |
-
f"\nThis appears to be a recipe or food-related document. "
|
1627 |
-
f"Extract the recipe title, ingredient list (with measurements), preparation steps, "
|
1628 |
-
f"cooking times, serving information, and any notes or tips. "
|
1629 |
-
f"Maintain the distinction between ingredients and preparation instructions. "
|
1630 |
-
f"For historical recipes, attend to archaic measurements (gill, dram, peck, firkin), obsolete "
|
1631 |
-
f"cooking terminology, and period-specific ingredients and their modern equivalents. Note handwritten "
|
1632 |
-
f"annotations and personal modifications. Identify period-specific cooking methods and tools that "
|
1633 |
-
f"might need explanation. Watch for liquid stains and food residue common on well-used recipe pages. "
|
1634 |
-
f"Recognize unclear fractions and temperature instructions (e.g., 'slow oven', 'quick fire').\n"
|
1635 |
-
)
|
1636 |
-
|
1637 |
-
# Output instructions (enhanced for custom requests)
|
1638 |
-
output_section = (
|
1639 |
-
f"Create a detailed structured JSON response with the following fields:\n"
|
1640 |
-
f"- file_name: The document's name\n"
|
1641 |
-
f"- topics: An array of specific topics, themes, or subjects covered in the document\n"
|
1642 |
-
f"- languages: An array of languages used in the document\n"
|
1643 |
-
f"- ocr_contents: A comprehensive dictionary with the document's contents including:\n"
|
1644 |
-
f" * title: The main title or heading\n"
|
1645 |
-
f" * subtitle: Any subtitle or secondary heading (if present)\n"
|
1646 |
-
f" * date: Publication or document date (if present)\n"
|
1647 |
-
f" * author: Author or creator information (if present)\n"
|
1648 |
-
f" * content: The main body content, properly formatted\n"
|
1649 |
-
f" * additional sections as appropriate for this document type\n"
|
1650 |
-
f" * raw_text: The complete OCR text\n"
|
1651 |
-
)
|
1652 |
-
else:
|
1653 |
-
# Default processing with basic historical document awareness
|
1654 |
-
specific_section = (
|
1655 |
-
f"You are an OCR specialist with knowledge of historical documents and typography. "
|
1656 |
-
f"Focus on accurately extracting text content with attention to historical features. "
|
1657 |
-
f"Pay special attention to:\n"
|
1658 |
-
f"- Accurately capturing ALL text content visible in the image\n"
|
1659 |
-
f"- Maintaining the correct reading order and structure\n"
|
1660 |
-
f"- Preserving paragraph breaks and text layout\n"
|
1661 |
-
f"- Identifying the main document type, time period, and language\n"
|
1662 |
-
f"- Recognizing historical typography features (long s 'ſ', ligatures, archaic characters)\n"
|
1663 |
-
f"- Accounting for document deterioration (faded ink, stains, foxing, physical damage)\n"
|
1664 |
-
)
|
1665 |
-
|
1666 |
-
# Only add specialized instructions for newspapers with columns
|
1667 |
-
if doc_type == "newspaper":
|
1668 |
-
specific_section += (
|
1669 |
-
f"\nThis appears to be a document with columns. "
|
1670 |
-
f"Be sure to read each column from top to bottom, then move to the next column. "
|
1671 |
-
f"Extract all article titles, headings, and body text.\n"
|
1672 |
-
)
|
1673 |
-
|
1674 |
-
# Simple output instructions for default cases
|
1675 |
-
output_section = (
|
1676 |
-
f"Create a structured JSON response with the following fields:\n"
|
1677 |
-
f"- file_name: The document's name\n"
|
1678 |
-
f"- topics: An array of topics covered in the document\n"
|
1679 |
-
f"- languages: An array of languages used in the document\n"
|
1680 |
-
f"- ocr_contents: A dictionary with the document's contents, with the focus on complete text extraction\n"
|
1681 |
-
)
|
1682 |
-
|
1683 |
# Add custom prompt if provided
|
1684 |
custom_section = ""
|
1685 |
if custom_prompt:
|
1686 |
-
|
1687 |
-
if "USER INSTRUCTIONS:" in custom_prompt:
|
1688 |
-
instructions_part = custom_prompt.split("USER INSTRUCTIONS:")[1].strip()
|
1689 |
-
custom_section = f"\n\nUser-provided instructions: {instructions_part}\n"
|
1690 |
-
elif "INSTRUCTIONS:" in custom_prompt:
|
1691 |
-
instructions_part = custom_prompt.split("INSTRUCTIONS:")[1].strip()
|
1692 |
-
custom_section = f"\n\nUser-provided instructions: {instructions_part}\n"
|
1693 |
-
else:
|
1694 |
-
# Strip custom prompt to essentials
|
1695 |
-
stripped_prompt = custom_prompt.replace("This is a", "").replace("It appears to be a", "")
|
1696 |
-
custom_section = f"\n\nUser-provided instructions: {stripped_prompt}\n"
|
1697 |
|
1698 |
-
#
|
1699 |
-
return generic_section +
|
1700 |
|
1701 |
def _extract_structured_data_text_only(self, ocr_markdown, filename, custom_prompt=None):
|
1702 |
"""
|
1703 |
Extract structured data using text-only model with detailed historical context prompting
|
1704 |
-
and improved error handling
|
1705 |
"""
|
1706 |
logger = logging.getLogger("text_processor")
|
1707 |
start_time = time.time()
|
@@ -1710,10 +1566,68 @@ class StructuredOCR:
|
|
1710 |
# Fast path: Skip for minimal OCR text
|
1711 |
if not ocr_markdown or len(ocr_markdown.strip()) < 50:
|
1712 |
logger.info("Minimal OCR text - returning basic result")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1713 |
return {
|
1714 |
"file_name": filename,
|
1715 |
"topics": ["Document"],
|
1716 |
-
"languages":
|
1717 |
"ocr_contents": {
|
1718 |
"raw_text": ocr_markdown if ocr_markdown else "No text could be extracted"
|
1719 |
},
|
@@ -1734,10 +1648,6 @@ class StructuredOCR:
|
|
1734 |
"processing_method": "test_mode"
|
1735 |
}
|
1736 |
|
1737 |
-
# Detect document type and build enhanced prompt
|
1738 |
-
doc_type = self._detect_document_type(custom_prompt, ocr_markdown)
|
1739 |
-
logger.info(f"Detected document type: {doc_type}")
|
1740 |
-
|
1741 |
# If OCR text is very large, truncate it to avoid API limits
|
1742 |
truncated_text = ocr_markdown
|
1743 |
if len(ocr_markdown) > 25000:
|
@@ -1745,8 +1655,25 @@ class StructuredOCR:
|
|
1745 |
truncated_text = ocr_markdown[:15000] + "\n...[content truncated]...\n" + ocr_markdown[-5000:]
|
1746 |
logger.info(f"OCR text truncated from {len(ocr_markdown)} to {len(truncated_text)} chars")
|
1747 |
|
1748 |
-
# Build
|
1749 |
-
enhanced_prompt =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1750 |
|
1751 |
# Use enhanced prompt with text-only model - with retry logic
|
1752 |
max_retries = 2
|
@@ -1784,40 +1711,14 @@ class StructuredOCR:
|
|
1784 |
if 'languages' in result:
|
1785 |
result['languages'] = [str(lang) for lang in result.get('languages', [])]
|
1786 |
|
1787 |
-
# Add processing metadata
|
1788 |
result['processing_method'] = 'text_model'
|
1789 |
-
result['document_type'] = doc_type
|
1790 |
result['model_used'] = TEXT_MODEL
|
1791 |
result['processing_time'] = time.time() - start_time
|
1792 |
|
1793 |
# Flag when custom prompt has been successfully applied
|
1794 |
if custom_prompt:
|
1795 |
result['custom_prompt_applied'] = 'text_model'
|
1796 |
-
|
1797 |
-
# Attempt to detect document type from custom prompt
|
1798 |
-
if "DOCUMENT TYPE:" in custom_prompt:
|
1799 |
-
doc_type_line = custom_prompt.split("\n")[0]
|
1800 |
-
if "DOCUMENT TYPE:" in doc_type_line:
|
1801 |
-
custom_doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip().lower()
|
1802 |
-
result['detected_document_type'] = custom_doc_type
|
1803 |
-
# Keyword-based detection as fallback
|
1804 |
-
elif any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
|
1805 |
-
result['detected_document_type'] = "newspaper"
|
1806 |
-
elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
|
1807 |
-
result['detected_document_type'] = "letter"
|
1808 |
-
elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
|
1809 |
-
result['detected_document_type'] = "book"
|
1810 |
-
elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
|
1811 |
-
result['detected_document_type'] = "form"
|
1812 |
-
elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
|
1813 |
-
result['detected_document_type'] = "recipe"
|
1814 |
-
elif "this is a" in custom_prompt.lower():
|
1815 |
-
# Extract document type from "This is a [type]" format
|
1816 |
-
this_is_parts = custom_prompt.lower().split("this is a ")
|
1817 |
-
if len(this_is_parts) > 1:
|
1818 |
-
extracted_type = this_is_parts[1].split(".")[0].strip()
|
1819 |
-
if extracted_type:
|
1820 |
-
result['detected_document_type'] = extracted_type
|
1821 |
|
1822 |
# Add raw text for reference if not already present
|
1823 |
if 'ocr_contents' in result and 'raw_text' not in result['ocr_contents']:
|
@@ -1880,18 +1781,7 @@ class StructuredOCR:
|
|
1880 |
"processing_time": time.time() - start_time
|
1881 |
}
|
1882 |
|
1883 |
-
#
|
1884 |
-
if ocr_markdown:
|
1885 |
-
# Simple content analysis
|
1886 |
-
text_sample = ocr_markdown[:5000].lower()
|
1887 |
-
|
1888 |
-
# Try to detect language
|
1889 |
-
if "dear" in text_sample and any(word in text_sample for word in ["sincerely", "regards", "truly"]):
|
1890 |
-
result["topics"].append("Letter")
|
1891 |
-
elif any(word in text_sample for word in ["recipe", "ingredients", "instructions", "cook", "bake"]):
|
1892 |
-
result["topics"].append("Recipe")
|
1893 |
-
elif any(word in text_sample for word in ["article", "report", "study", "analysis"]):
|
1894 |
-
result["topics"].append("Article")
|
1895 |
|
1896 |
except Exception as inner_e:
|
1897 |
logger.error(f"Error creating basic result: {str(inner_e)}")
|
@@ -1919,4 +1809,4 @@ if __name__ == "__main__":
|
|
1919 |
processor = StructuredOCR()
|
1920 |
result = processor.process_file(file_path)
|
1921 |
|
1922 |
-
print(json.dumps(result, indent=2))
|
|
|
1 |
+
# Standard library imports
|
2 |
import os
|
3 |
import sys
|
4 |
import time
|
5 |
import random
|
|
|
|
|
6 |
import json
|
7 |
import base64
|
8 |
import logging
|
9 |
+
from enum import Enum
|
10 |
+
from pathlib import Path
|
11 |
from functools import lru_cache
|
12 |
from typing import Optional, Dict, Any, List, Union, Tuple
|
13 |
|
14 |
+
# Configure logging
|
15 |
+
logging.basicConfig(level=logging.INFO,
|
16 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
17 |
+
logger = logging.getLogger(__name__)
|
18 |
+
|
19 |
+
# Third-party imports
|
20 |
+
from pydantic import BaseModel
|
21 |
+
|
22 |
# Try to import pycountry, provide fallback if not available
|
23 |
try:
|
24 |
import pycountry
|
25 |
PYCOUNTRY_AVAILABLE = True
|
26 |
except ImportError:
|
27 |
PYCOUNTRY_AVAILABLE = False
|
28 |
+
logger.warning("pycountry module not available - using language code fallback")
|
|
|
|
|
29 |
|
30 |
# Try to import Mistral AI, provide fallback if not available
|
31 |
try:
|
|
|
35 |
MISTRAL_AVAILABLE = True
|
36 |
except ImportError:
|
37 |
MISTRAL_AVAILABLE = False
|
38 |
+
logger.warning("mistralai module not available - OCR functionality will be limited")
|
|
|
|
|
|
|
|
|
39 |
|
40 |
# Import utilities for OCR processing
|
41 |
try:
|
|
|
219 |
if file_type is None:
|
220 |
suffix = file_path.suffix.lower()
|
221 |
file_type = "pdf" if suffix == ".pdf" else "image"
|
222 |
+
|
223 |
+
# Check for handwritten document by filename
|
224 |
+
filename_lower = file_path.name.lower()
|
225 |
+
if "handwritten" in filename_lower or "manuscript" in filename_lower or "letter" in filename_lower:
|
226 |
+
logger.info(f"Detected likely handwritten document from filename: {file_path.name}")
|
227 |
+
# This will be used during processing to apply handwritten-specific handling
|
228 |
|
229 |
# Get file size if not provided
|
230 |
if file_size_mb is None and file_path.exists():
|
|
|
446 |
# Convert only the selected pages to minimize memory usage
|
447 |
selected_images = []
|
448 |
combined_text = []
|
449 |
+
detected_languages = set() # Track detected languages across all pages
|
450 |
|
451 |
# Process pages in larger batches for better efficiency
|
452 |
batch_size = 5 # Process 5 pages at a time for better throughput
|
|
|
482 |
# Add page text to combined text without obvious page markers
|
483 |
page_text = page_result['ocr_contents']['raw_text']
|
484 |
combined_text.append(f"{page_text}")
|
485 |
+
|
486 |
+
# Collect detected languages from each page
|
487 |
+
if 'languages' in page_result:
|
488 |
+
for lang in page_result['languages']:
|
489 |
+
detected_languages.add(lang)
|
490 |
except Exception as page_e:
|
491 |
logger.warning(f"Error processing page {page_num}: {str(page_e)}")
|
492 |
# Clean up temp file
|
|
|
524 |
# Add flag to indicate custom prompt was applied
|
525 |
result['custom_prompt_applied'] = 'text_only'
|
526 |
|
527 |
+
# Simplified approach - no document type detection
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
528 |
|
529 |
except Exception as e:
|
530 |
logger.warning(f"Custom prompt processing failed: {str(e)}. Using standard processing.")
|
|
|
538 |
if 'ocr_contents' in result:
|
539 |
result['ocr_contents']['raw_text'] = all_text
|
540 |
|
541 |
+
# Merge detected languages if available
|
542 |
+
if detected_languages:
|
543 |
+
result['languages'] = list(detected_languages)
|
544 |
+
|
545 |
# Add PDF metadata
|
546 |
result['file_name'] = file_path.name
|
547 |
result['pdf_processing_method'] = 'pdf2image_optimized'
|
|
|
709 |
limited_pages = True
|
710 |
logger.info(f"Processing {len(pages_to_process)} pages out of {total_pages} total")
|
711 |
|
712 |
+
# Directly extract any language information from the OCR response
|
713 |
+
detected_languages = set()
|
714 |
+
|
715 |
+
# Check if the response has a 'languages' attribute in any form
|
716 |
+
# First check direct attributes on the response object
|
717 |
+
if hasattr(pdf_response, 'languages') and pdf_response.languages:
|
718 |
+
for lang in pdf_response.languages:
|
719 |
+
detected_languages.add(str(lang))
|
720 |
+
logger.info(f"Found language in OCR response: {lang}")
|
721 |
+
|
722 |
+
# Then check if it's in the response as a dictionary format
|
723 |
+
elif hasattr(pdf_response, '__dict__'):
|
724 |
+
response_dict = pdf_response.__dict__
|
725 |
+
if 'languages' in response_dict and response_dict['languages']:
|
726 |
+
for lang in response_dict['languages']:
|
727 |
+
detected_languages.add(str(lang))
|
728 |
+
logger.info(f"Found language in OCR response dict: {lang}")
|
729 |
+
|
730 |
# Calculate confidence score if available
|
731 |
try:
|
732 |
confidence_values = [page.confidence for page in pages_to_process if hasattr(page, 'confidence')]
|
|
|
749 |
if page_markdown.strip():
|
750 |
all_markdown.append(f"{page_markdown}")
|
751 |
|
752 |
+
# Collect language information from individual pages if available
|
753 |
+
if hasattr(page, 'languages') and page.languages:
|
754 |
+
for lang in page.languages:
|
755 |
+
detected_languages.add(str(lang))
|
756 |
+
logger.info(f"Found language in page {page_num}: {lang}")
|
757 |
+
|
758 |
# Join all pages with separation
|
759 |
combined_markdown = "\n\n".join(all_markdown)
|
760 |
|
|
|
788 |
combined_markdown, file_path.name, custom_prompt
|
789 |
)
|
790 |
|
791 |
+
# If we have detected languages directly from the OCR model, use them
|
792 |
+
if detected_languages:
|
793 |
+
logger.info(f"Using languages detected by OCR model: {', '.join(detected_languages)}")
|
794 |
+
result['languages'] = list(detected_languages)
|
795 |
+
# Add flag to indicate source of language detection
|
796 |
+
result['language_detection_source'] = 'mistral-ocr-latest'
|
797 |
+
|
798 |
# Add metadata about pages
|
799 |
if limited_pages:
|
800 |
result['limited_pages'] = {
|
|
|
956 |
"confidence_score": 0.0
|
957 |
}
|
958 |
|
959 |
+
# Check if this is likely a newspaper or handwritten document by filename
|
960 |
is_likely_newspaper = False
|
961 |
+
is_likely_handwritten = False
|
962 |
+
|
963 |
newspaper_keywords = ["newspaper", "gazette", "herald", "times", "journal",
|
964 |
"chronicle", "post", "tribune", "news", "press", "gender"]
|
965 |
+
|
966 |
+
handwritten_keywords = ["handwritten", "manuscript", "letter", "correspondence", "journal", "diary"]
|
967 |
|
968 |
+
# Check filename for document type indicators
|
969 |
filename_lower = file_path.name.lower()
|
970 |
+
|
971 |
+
# First check for handwritten documents
|
972 |
+
for keyword in handwritten_keywords:
|
973 |
if keyword in filename_lower:
|
974 |
+
is_likely_handwritten = True
|
975 |
+
logger.info(f"Likely handwritten document detected from filename: {file_path.name}")
|
976 |
+
# Add handwritten-specific processing hint to custom_prompt if not already present
|
977 |
if custom_prompt:
|
978 |
+
if "handwritten" not in custom_prompt.lower():
|
979 |
+
custom_prompt = custom_prompt + " This appears to be a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations."
|
980 |
else:
|
981 |
+
custom_prompt = "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations."
|
982 |
break
|
983 |
+
|
984 |
+
# Then check for newspaper if not handwritten
|
985 |
+
if not is_likely_handwritten:
|
986 |
+
for keyword in newspaper_keywords:
|
987 |
+
if keyword in filename_lower:
|
988 |
+
is_likely_newspaper = True
|
989 |
+
logger.info(f"Likely newspaper document detected from filename: {file_path.name}")
|
990 |
+
# Add newspaper-specific processing hint to custom_prompt if not already present
|
991 |
+
if custom_prompt:
|
992 |
+
if "column" not in custom_prompt.lower() and "newspaper" not in custom_prompt.lower():
|
993 |
+
custom_prompt = custom_prompt + " This appears to be a newspaper or document with columns. Please extract all text content from each column."
|
994 |
+
else:
|
995 |
+
custom_prompt = "This appears to be a newspaper or document with columns. Please extract all text content from each column, maintaining proper reading order."
|
996 |
+
break
|
997 |
|
998 |
try:
|
999 |
# Check file size
|
|
|
1066 |
logger.info(f"Resized image to {new_size_mb:.2f} MB")
|
1067 |
except ImportError:
|
1068 |
logger.warning("PIL not available for resizing. Using original image.")
|
1069 |
+
# Use enhanced encoder with proper MIME type detection
|
1070 |
+
from ocr_utils import encode_image_for_api
|
1071 |
+
base64_data_url = encode_image_for_api(file_path)
|
1072 |
except Exception as e:
|
1073 |
logger.warning(f"Image resize failed: {str(e)}. Using original image.")
|
1074 |
+
# Use enhanced encoder with proper MIME type detection
|
1075 |
+
from ocr_utils import encode_image_for_api
|
1076 |
+
base64_data_url = encode_image_for_api(file_path)
|
1077 |
else:
|
1078 |
+
# For smaller images, use as-is with proper MIME type
|
1079 |
+
from ocr_utils import encode_image_for_api
|
1080 |
+
base64_data_url = encode_image_for_api(file_path)
|
1081 |
except Exception as e:
|
1082 |
# Fallback to original image if any preprocessing fails
|
1083 |
logger.warning(f"Image preprocessing failed: {str(e)}. Using original image.")
|
1084 |
+
# Use enhanced encoder with proper MIME type detection
|
1085 |
+
from ocr_utils import encode_image_for_api
|
1086 |
+
base64_data_url = encode_image_for_api(file_path)
|
1087 |
|
1088 |
# Process the image with OCR
|
1089 |
logger.info(f"Processing image with OCR using {OCR_MODEL}")
|
|
|
1175 |
# Get the OCR markdown from the first page
|
1176 |
image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""
|
1177 |
|
1178 |
+
# Check if the OCR response has images
|
1179 |
+
has_images = hasattr(image_response, 'pages') and image_response.pages and hasattr(image_response.pages[0], 'images') and image_response.pages[0].images
|
1180 |
+
|
1181 |
+
# Check for language information directly from the OCR model
|
1182 |
+
detected_languages = set()
|
1183 |
+
|
1184 |
+
# Check if the response has a 'languages' attribute in any form
|
1185 |
+
# First check direct attributes on the response object
|
1186 |
+
if hasattr(image_response, 'languages') and image_response.languages:
|
1187 |
+
for lang in image_response.languages:
|
1188 |
+
detected_languages.add(str(lang))
|
1189 |
+
logger.info(f"Found language in OCR response: {lang}")
|
1190 |
+
|
1191 |
+
# Then check if it's in the response as a dictionary format
|
1192 |
+
elif hasattr(image_response, '__dict__'):
|
1193 |
+
response_dict = image_response.__dict__
|
1194 |
+
if 'languages' in response_dict and response_dict['languages']:
|
1195 |
+
for lang in response_dict['languages']:
|
1196 |
+
detected_languages.add(str(lang))
|
1197 |
+
logger.info(f"Found language in OCR response dict: {lang}")
|
1198 |
+
|
1199 |
+
# Check for languages in individual pages
|
1200 |
+
if hasattr(image_response, 'pages') and image_response.pages:
|
1201 |
+
for page in image_response.pages:
|
1202 |
+
if hasattr(page, 'languages') and page.languages:
|
1203 |
+
for lang in page.languages:
|
1204 |
+
detected_languages.add(str(lang))
|
1205 |
+
logger.info(f"Found language in page: {lang}")
|
1206 |
+
|
1207 |
# Optimize: Skip vision model step if ocr_markdown is very small or empty
|
1208 |
# BUT make an exception for newspapers or if custom_prompt is provided
|
1209 |
+
# OR if the image has visual content worth preserving
|
1210 |
+
if (not is_likely_newspaper and not custom_prompt and not has_images) and (not image_ocr_markdown or len(image_ocr_markdown) < 50):
|
1211 |
+
logger.warning("OCR produced minimal text with no images. Returning basic result.")
|
1212 |
return {
|
1213 |
"file_name": file_path.name,
|
1214 |
"topics": ["Document"],
|
|
|
1216 |
"ocr_contents": {
|
1217 |
"raw_text": image_ocr_markdown if image_ocr_markdown else "No text could be extracted from the image."
|
1218 |
},
|
1219 |
+
"processing_note": "OCR produced minimal text content",
|
1220 |
+
# Include raw response data for images
|
1221 |
+
"raw_response_data": serialize_ocr_response(image_response)
|
1222 |
}
|
1223 |
|
1224 |
# For newspapers with little text in OCR, set a more explicit prompt
|
|
|
1228 |
custom_prompt = "This is a newspaper or document with columns. The OCR may not have captured all text. Please examine the image carefully and extract ALL text content visible in the document, reading each column from top to bottom."
|
1229 |
elif "extract all text" not in custom_prompt.lower():
|
1230 |
custom_prompt += " Please examine the image carefully and extract ALL text content visible in the document."
|
1231 |
+
|
1232 |
+
# For images with minimal text but visual content, enhance the prompt
|
1233 |
+
elif has_images and (not image_ocr_markdown or len(image_ocr_markdown) < 100):
|
1234 |
+
logger.info("Document with images but minimal text detected. Using enhanced prompt for mixed media.")
|
1235 |
+
if not custom_prompt:
|
1236 |
+
custom_prompt = "This is a mixed media document with both text and important visual elements. Please carefully describe the image content and extract all visible text, preserving the relationship between text and visuals."
|
1237 |
+
elif "visual" not in custom_prompt.lower() and "image" not in custom_prompt.lower():
|
1238 |
+
custom_prompt += " The document contains important visual elements that should be described along with the text content."
|
1239 |
|
1240 |
# Extract structured data using the appropriate model, with a single API call
|
1241 |
if use_vision:
|
|
|
1245 |
logger.info(f"Using text-only model: {TEXT_MODEL}")
|
1246 |
result = self._extract_structured_data_text_only(image_ocr_markdown, file_path.name, custom_prompt)
|
1247 |
|
1248 |
+
# If we have detected languages directly from the OCR model, use them
|
1249 |
+
if detected_languages:
|
1250 |
+
logger.info(f"Using languages detected by OCR model: {', '.join(detected_languages)}")
|
1251 |
+
result['languages'] = list(detected_languages)
|
1252 |
+
# Add flag to indicate source of language detection
|
1253 |
+
result['language_detection_source'] = 'mistral-ocr-latest'
|
1254 |
+
|
1255 |
# Store the serialized OCR response for image rendering (for compatibility with original version)
|
1256 |
# Don't store raw_response directly as it's not JSON serializable
|
1257 |
serialized_response = serialize_ocr_response(image_response)
|
|
|
1259 |
|
1260 |
# Store key parts of the OCR response for image rendering
|
1261 |
# With serialized format that can be stored in JSON
|
|
|
1262 |
result['has_images'] = has_images
|
1263 |
|
1264 |
if has_images:
|
|
|
1371 |
logger.info("Test mode or no API key, using text-only processing")
|
1372 |
return self._extract_structured_data_text_only(ocr_markdown, filename)
|
1373 |
|
|
|
|
|
|
|
|
|
1374 |
# Use only the first part of OCR text to keep prompts small and processing fast
|
1375 |
if len(ocr_markdown) > 1000:
|
1376 |
truncated_ocr = ocr_markdown[:1000]
|
|
|
1378 |
else:
|
1379 |
truncated_ocr = ocr_markdown
|
1380 |
|
1381 |
+
# Build a comprehensive prompt with OCR text and detailed instructions for language detection and image handling
|
1382 |
+
enhanced_prompt = f"This is a document's OCR text:\n<BEGIN_OCR>\n{truncated_ocr}\n<END_OCR>\n\n"
|
1383 |
+
|
1384 |
+
# Add custom prompt if provided
|
1385 |
+
if custom_prompt:
|
1386 |
+
enhanced_prompt += f"User instructions: {custom_prompt}\n\n"
|
1387 |
+
|
1388 |
+
# Add comprehensive extraction instructions with language detection guidance
|
1389 |
+
enhanced_prompt += "Extract all text content accurately from this document, including any text visible in the image that may not have been captured by OCR.\n\n"
|
1390 |
+
enhanced_prompt += "IMPORTANT: Precisely identify and list ALL languages present in the document separately. Look closely for multiple languages that might appear together.\n"
|
1391 |
+
enhanced_prompt += "For language detection, examine these specific indicators:\n"
|
1392 |
+
enhanced_prompt += "- Portuguese: accents (ã, õ, á, é, ê, ó, ç), words like 'e', 'o', 'de', 'da', 'do', 'em', 'para', 'que', 'não', 'com'\n"
|
1393 |
+
enhanced_prompt += "- Spanish: ñ, inverted punctuation (¿, ¡), accents (á, é, í, ó, ú), words like 'el', 'la', 'los', 'las', 'y', 'en', 'por', 'que', 'con'\n"
|
1394 |
+
enhanced_prompt += "- French: accents (é, è, ê, à, ç), words like 'le', 'la', 'les', 'et', 'en', 'de', 'du'\n"
|
1395 |
+
enhanced_prompt += "- German: umlauts (ä, ö, ü), sharp s (ß), words like 'und', 'der', 'die', 'das', 'in', 'mit'\n"
|
1396 |
+
enhanced_prompt += "- Italian: accents (à, è, é, ì, ò, ù), words like 'il', 'la', 'e', 'di', 'che', 'per', 'con'\n"
|
1397 |
+
enhanced_prompt += "- Chinese: hanzi characters (汉字), lack of spaces between words, markers like 的, 是, 了, 在, 和, 有\n"
|
1398 |
+
enhanced_prompt += "- Latin: words like 'et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed'\n\n"
|
1399 |
+
enhanced_prompt += "If the document contains multiple columns or sections, process each section independently and then combine them logically.\n"
|
1400 |
+
enhanced_prompt += "Return ALL detected languages as separate entries in the languages array, never combine them."
|
1401 |
|
1402 |
# Measure API call time for optimization feedback
|
1403 |
start_time = time.time()
|
|
|
1406 |
# Use a fixed, shorter timeout for single-page documents
|
1407 |
timeout_ms = 45000 # 45 seconds is optimal for most single-page documents
|
1408 |
|
1409 |
+
logger.info(f"Calling vision model with {timeout_ms}ms timeout")
|
1410 |
chat_response = self.client.chat.parse(
|
1411 |
model=VISION_MODEL,
|
1412 |
messages=[
|
|
|
1424 |
)
|
1425 |
|
1426 |
api_time = time.time() - start_time
|
1427 |
+
logger.info(f"Vision model completed in {api_time:.2f}s")
|
1428 |
|
1429 |
except Exception as e:
|
1430 |
# If there's an error with the enhanced prompt, try progressively simpler approaches
|
|
|
1504 |
if 'languages' in result:
|
1505 |
result['languages'] = [str(lang) for lang in result.get('languages', [])]
|
1506 |
|
1507 |
+
# Add simplified metadata about processing
|
1508 |
result['processing_info'] = {
|
1509 |
'method': 'vision_model',
|
|
|
1510 |
'ocr_text_length': len(ocr_markdown),
|
1511 |
'api_response_time': time.time() - start_time
|
1512 |
}
|
1513 |
|
1514 |
+
# Note if custom prompt was applied
|
1515 |
if custom_prompt:
|
1516 |
result['custom_prompt_applied'] = 'vision_model'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1517 |
|
1518 |
# Add confidence score if not present
|
1519 |
if 'confidence_score' not in result:
|
|
|
1526 |
|
1527 |
return result
|
1528 |
|
1529 |
+
# We've removed document type detection entirely for simplicity
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1530 |
|
1531 |
+
# Create a prompt with enhanced language detection instructions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1532 |
generic_section = (
|
1533 |
+
f"You are an OCR specialist processing historical documents. "
|
1534 |
+
f"Focus on accurately extracting text content while preserving structure and formatting. "
|
1535 |
+
f"Pay attention to any historical features and document characteristics.\n\n"
|
1536 |
+
f"IMPORTANT: Accurately identify the document's language(s). Look for language-specific characters, words, and phrases. "
|
1537 |
+
f"Specifically check for French (accents like é, è, ç, words like 'le', 'la', 'et', 'est'), German (umlauts, words like 'und', 'der', 'das'), "
|
1538 |
+
f"Latin, and other non-English languages. Carefully analyze the text before determining language.\n\n"
|
1539 |
+
f"Create a structured JSON response with the following fields:\n"
|
1540 |
+
f"- file_name: The document's name\n"
|
1541 |
+
f"- topics: An array of topics covered in the document\n"
|
1542 |
+
f"- languages: An array of languages used in the document (be precise and specific about language detection)\n"
|
1543 |
+
f"- ocr_contents: A comprehensive dictionary with the document's contents including:\n"
|
1544 |
+
f" * title: The main title or heading (if present)\n"
|
1545 |
+
f" * content: The main body content\n"
|
1546 |
+
f" * raw_text: The complete OCR text\n"
|
1547 |
)
|
1548 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1549 |
# Add custom prompt if provided
|
1550 |
custom_section = ""
|
1551 |
if custom_prompt:
|
1552 |
+
custom_section = f"\n\nUser-provided instructions: {custom_prompt}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1553 |
|
1554 |
+
# Return the enhanced prompt
|
1555 |
+
return generic_section + custom_section
|
1556 |
|
1557 |
def _extract_structured_data_text_only(self, ocr_markdown, filename, custom_prompt=None):
|
1558 |
"""
|
1559 |
Extract structured data using text-only model with detailed historical context prompting
|
1560 |
+
and improved error handling with enhanced language detection
|
1561 |
"""
|
1562 |
logger = logging.getLogger("text_processor")
|
1563 |
start_time = time.time()
|
|
|
1566 |
# Fast path: Skip for minimal OCR text
|
1567 |
if not ocr_markdown or len(ocr_markdown.strip()) < 50:
|
1568 |
logger.info("Minimal OCR text - returning basic result")
|
1569 |
+
|
1570 |
+
# Attempt comprehensive language detection even for minimal text
|
1571 |
+
detected_languages = []
|
1572 |
+
|
1573 |
+
# Simple language detection based on character frequency
|
1574 |
+
if ocr_markdown and len(ocr_markdown.strip()) > 10:
|
1575 |
+
# Define indicators for all supported languages
|
1576 |
+
language_indicators = {
|
1577 |
+
"Portuguese": {
|
1578 |
+
"chars": ['ã', 'õ', 'á', 'é', 'ê', 'í', 'ó', 'ú', 'ç'],
|
1579 |
+
"words": ['e', 'o', 'de', 'da', 'do', 'em', 'para', 'que', 'não', 'com']
|
1580 |
+
},
|
1581 |
+
"Spanish": {
|
1582 |
+
"chars": ['ñ', 'á', 'é', 'í', 'ó', 'ú', '¿', '¡'],
|
1583 |
+
"words": ['el', 'la', 'los', 'las', 'y', 'en', 'por', 'que', 'con', 'del']
|
1584 |
+
},
|
1585 |
+
"French": {
|
1586 |
+
"chars": ['é', 'è', 'ê', 'à', 'ç', 'ù', 'â', 'î', 'ô', 'û'],
|
1587 |
+
"words": ['le', 'la', 'les', 'et', 'en', 'de', 'du', 'des', 'un', 'une']
|
1588 |
+
},
|
1589 |
+
"German": {
|
1590 |
+
"chars": ['ä', 'ö', 'ü', 'ß'],
|
1591 |
+
"words": ['der', 'die', 'das', 'und', 'ist', 'von', 'mit', 'für', 'sich']
|
1592 |
+
},
|
1593 |
+
"Italian": {
|
1594 |
+
"chars": ['à', 'è', 'é', 'ì', 'ò', 'ù'],
|
1595 |
+
"words": ['il', 'la', 'e', 'di', 'che', 'per', 'con', 'sono', 'non']
|
1596 |
+
},
|
1597 |
+
"Latin": {
|
1598 |
+
"chars": [],
|
1599 |
+
"words": ['et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed', 'qui', 'quod']
|
1600 |
+
}
|
1601 |
+
}
|
1602 |
+
|
1603 |
+
words = ocr_markdown.lower().split()
|
1604 |
+
|
1605 |
+
# Check for indicators of each language
|
1606 |
+
for language, indicators in language_indicators.items():
|
1607 |
+
chars = indicators["chars"]
|
1608 |
+
lang_words = indicators["words"]
|
1609 |
+
|
1610 |
+
has_chars = any(char in ocr_markdown for char in chars) if chars else False
|
1611 |
+
word_count = sum(1 for word in words if word in lang_words)
|
1612 |
+
|
1613 |
+
# Add language if strong enough indicators are present
|
1614 |
+
if has_chars or word_count >= 2:
|
1615 |
+
detected_languages.append(language)
|
1616 |
+
|
1617 |
+
# Check for English separately
|
1618 |
+
english_words = ['the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for', 'it']
|
1619 |
+
english_count = sum(1 for word in words if word in english_words)
|
1620 |
+
if english_count >= 2:
|
1621 |
+
detected_languages.append("English")
|
1622 |
+
|
1623 |
+
# If no languages detected, default to English
|
1624 |
+
if not detected_languages:
|
1625 |
+
detected_languages = ["English"]
|
1626 |
+
|
1627 |
return {
|
1628 |
"file_name": filename,
|
1629 |
"topics": ["Document"],
|
1630 |
+
"languages": detected_languages,
|
1631 |
"ocr_contents": {
|
1632 |
"raw_text": ocr_markdown if ocr_markdown else "No text could be extracted"
|
1633 |
},
|
|
|
1648 |
"processing_method": "test_mode"
|
1649 |
}
|
1650 |
|
|
|
|
|
|
|
|
|
1651 |
# If OCR text is very large, truncate it to avoid API limits
|
1652 |
truncated_text = ocr_markdown
|
1653 |
if len(ocr_markdown) > 25000:
|
|
|
1655 |
truncated_text = ocr_markdown[:15000] + "\n...[content truncated]...\n" + ocr_markdown[-5000:]
|
1656 |
logger.info(f"OCR text truncated from {len(ocr_markdown)} to {len(truncated_text)} chars")
|
1657 |
|
1658 |
+
# Build a prompt with enhanced language detection instructions
|
1659 |
+
enhanced_prompt = f"This is a document's OCR text:\n<BEGIN_OCR>\n{truncated_text}\n<END_OCR>\n\n"
|
1660 |
+
|
1661 |
+
# Add custom prompt if provided
|
1662 |
+
if custom_prompt:
|
1663 |
+
enhanced_prompt += f"User instructions: {custom_prompt}\n\n"
|
1664 |
+
|
1665 |
+
# Add thorough extraction instructions with enhanced language detection and metadata requirements
|
1666 |
+
enhanced_prompt += "Extract all text content accurately from this document. Return structured data with the document's contents.\n\n"
|
1667 |
+
enhanced_prompt += "IMPORTANT: Precisely identify and list ALL languages present in the document separately. Look closely for multiple languages that might appear together.\n"
|
1668 |
+
enhanced_prompt += "For language detection, examine these specific indicators:\n"
|
1669 |
+
enhanced_prompt += "- French: accents (é, è, ê, à, ç), words like 'le', 'la', 'les', 'et', 'en', 'de', 'du'\n"
|
1670 |
+
enhanced_prompt += "- German: umlauts (ä, ö, ü), sharp s (ß), words like 'und', 'der', 'die', 'das', 'in', 'mit'\n"
|
1671 |
+
enhanced_prompt += "- Spanish: ñ, inverted punctuation (¿, ¡), accents (á, é, í, ó, ú), words like 'el', 'la', 'los', 'las', 'y', 'en'\n"
|
1672 |
+
enhanced_prompt += "- Italian: words like 'il', 'la', 'e', 'di', 'che', 'per', 'con'\n"
|
1673 |
+
enhanced_prompt += "- Chinese: hanzi characters (汉字), lack of spaces between words, markers like 的, 是, 了, 在, 和, 有\n"
|
1674 |
+
enhanced_prompt += "- Latin: words like 'et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed'\n"
|
1675 |
+
enhanced_prompt += "Do NOT classify text as English unless you can positively confirm it contains specifically English words and phrases.\n\n"
|
1676 |
+
enhanced_prompt += "Return ALL detected languages as separate entries in the languages array. If multiple languages are present, list them ALL separately."
|
1677 |
|
1678 |
# Use enhanced prompt with text-only model - with retry logic
|
1679 |
max_retries = 2
|
|
|
1711 |
if 'languages' in result:
|
1712 |
result['languages'] = [str(lang) for lang in result.get('languages', [])]
|
1713 |
|
1714 |
+
# Add simplified processing metadata
|
1715 |
result['processing_method'] = 'text_model'
|
|
|
1716 |
result['model_used'] = TEXT_MODEL
|
1717 |
result['processing_time'] = time.time() - start_time
|
1718 |
|
1719 |
# Flag when custom prompt has been successfully applied
|
1720 |
if custom_prompt:
|
1721 |
result['custom_prompt_applied'] = 'text_model'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1722 |
|
1723 |
# Add raw text for reference if not already present
|
1724 |
if 'ocr_contents' in result and 'raw_text' not in result['ocr_contents']:
|
|
|
1781 |
"processing_time": time.time() - start_time
|
1782 |
}
|
1783 |
|
1784 |
+
# No topic detection to avoid issue with document misclassification
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1785 |
|
1786 |
except Exception as inner_e:
|
1787 |
logger.error(f"Error creating basic result: {str(inner_e)}")
|
|
|
1809 |
processor = StructuredOCR()
|
1810 |
result = processor.process_file(file_path)
|
1811 |
|
1812 |
+
print(json.dumps(result, indent=2))
|
ui/layout.py
CHANGED
@@ -1,217 +1,339 @@
|
|
1 |
import streamlit as st
|
2 |
|
3 |
def load_css():
|
4 |
-
"""Load custom CSS for the application"""
|
5 |
st.markdown("""
|
6 |
<style>
|
7 |
-
/* Global styles */
|
8 |
-
|
9 |
-
|
10 |
-
color: #333;
|
11 |
}
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
h1, h2, h3, h4, h5, h6 {
|
15 |
-
font-family: '
|
16 |
font-weight: 600;
|
17 |
-
color: #
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
}
|
19 |
|
20 |
-
/*
|
21 |
-
.
|
22 |
-
|
23 |
-
|
24 |
-
background-color: #E3F2FD;
|
25 |
-
border-radius: 0 4px 4px 0;
|
26 |
-
margin: 10px 0;
|
27 |
-
font-size: 14px;
|
28 |
}
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
}
|
34 |
|
35 |
-
.
|
36 |
-
background
|
37 |
-
border-radius:
|
38 |
-
padding: 15px;
|
39 |
-
margin-bottom: 15px;
|
40 |
-
border: 1px solid #e0e0e0;
|
41 |
-
transition: all 0.2s ease;
|
42 |
}
|
43 |
|
44 |
-
.
|
45 |
-
|
46 |
-
border-color: #c0c0c0;
|
47 |
}
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
53 |
}
|
54 |
|
55 |
-
.
|
56 |
-
|
57 |
-
font-size:
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
}
|
59 |
|
60 |
-
.
|
61 |
-
|
62 |
-
|
|
|
|
|
63 |
}
|
64 |
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
68 |
}
|
69 |
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
71 |
margin-bottom: 5px;
|
72 |
-
color: #555;
|
73 |
}
|
74 |
|
75 |
-
.
|
76 |
-
|
77 |
-
|
78 |
}
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
background-color: #f0f2f6;
|
84 |
-
border-radius: 8px;
|
85 |
-
border: 1px solid #d0d7de;
|
86 |
}
|
87 |
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
}
|
93 |
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
95 |
.stTextArea textarea {
|
96 |
-
font-family: '
|
97 |
-
font-size:
|
98 |
-
line-height:
|
|
|
99 |
}
|
100 |
|
101 |
-
/*
|
102 |
-
.
|
103 |
-
|
104 |
-
|
105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
}
|
107 |
-
|
108 |
-
|
109 |
-
|
|
|
|
|
|
|
|
|
110 |
}
|
111 |
|
112 |
-
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
}
|
115 |
|
116 |
-
/*
|
117 |
-
.
|
118 |
-
|
119 |
}
|
120 |
|
121 |
-
|
122 |
-
|
|
|
123 |
}
|
124 |
|
125 |
-
/*
|
126 |
.sidebar .block-container {
|
127 |
-
padding-top: 0;
|
128 |
}
|
129 |
|
130 |
-
.sidebar
|
131 |
-
|
132 |
-
}
|
133 |
-
|
134 |
-
/* Button styling */
|
135 |
-
.stButton > button {
|
136 |
-
border-radius: 4px;
|
137 |
-
font-weight: 600;
|
138 |
}
|
139 |
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
}
|
144 |
|
145 |
-
/*
|
146 |
-
.
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
.stFileUploader label p,
|
151 |
-
.stFileUploader label span,
|
152 |
-
.stFileUploader div[data-testid="stFileUploadDropzone"] p,
|
153 |
-
.stFileUploader div[data-testid="stFileUploadDropzone"] span {
|
154 |
-
writing-mode: horizontal-tb !important;
|
155 |
}
|
156 |
|
157 |
-
/*
|
158 |
-
.
|
159 |
-
|
160 |
-
|
161 |
-
padding: 15px;
|
162 |
-
margin-bottom: 20px;
|
163 |
-
border: 1px solid #e0e0e0;
|
164 |
}
|
165 |
|
166 |
-
/*
|
167 |
-
|
168 |
-
|
|
|
|
|
|
|
169 |
}
|
170 |
|
171 |
-
/*
|
172 |
-
|
173 |
-
|
|
|
174 |
}
|
175 |
|
176 |
-
|
177 |
-
|
178 |
-
|
|
|
|
|
179 |
}
|
180 |
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
padding: 10px;
|
186 |
-
border-radius: 4px;
|
187 |
-
border-left: 5px solid #155724;
|
188 |
}
|
189 |
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
padding: 10px;
|
195 |
-
border-radius: 4px;
|
196 |
-
border-left: 5px solid #721C24;
|
197 |
}
|
198 |
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
padding: 10px;
|
204 |
-
border-radius: 4px;
|
205 |
-
border-left: 5px solid #0C5460;
|
206 |
}
|
207 |
|
208 |
-
/*
|
209 |
-
|
210 |
-
|
211 |
-
color: #856404;
|
212 |
-
padding: 10px;
|
213 |
-
border-radius: 4px;
|
214 |
-
border-left: 5px solid #856404;
|
215 |
}
|
216 |
</style>
|
217 |
""", unsafe_allow_html=True)
|
|
|
1 |
import streamlit as st
|
2 |
|
3 |
def load_css():
|
4 |
+
"""Load custom CSS for the application - inspired by mistral-ocr implementations"""
|
5 |
st.markdown("""
|
6 |
<style>
|
7 |
+
/* Global styles - clean, modern approach with consistent line height */
|
8 |
+
:root {
|
9 |
+
--standard-line-height: 1.5;
|
|
|
10 |
}
|
11 |
|
12 |
+
body {
|
13 |
+
font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
|
14 |
+
color: #111827;
|
15 |
+
line-height: var(--standard-line-height);
|
16 |
+
}
|
17 |
+
|
18 |
+
/* Remove all container backgrounds that cause the white box issue */
|
19 |
+
div[data-testid="stMarkdownContainer"],
|
20 |
+
div[data-testid="stText"],
|
21 |
+
div.stMarkdown,
|
22 |
+
.stText > div:first-child,
|
23 |
+
.element-container > div,
|
24 |
+
div[data-testid="column"] > div > div > div {
|
25 |
+
background-color: transparent !important;
|
26 |
+
box-shadow: none !important;
|
27 |
+
border: none !important;
|
28 |
+
border-radius: 0 !important;
|
29 |
+
padding: 0 !important;
|
30 |
+
margin: 0 !important;
|
31 |
+
}
|
32 |
+
|
33 |
+
/* Base text styling with standardized line height */
|
34 |
+
div[data-testid="stMarkdownContainer"] > p {
|
35 |
+
margin: 0 0 0.3rem 0 !important;
|
36 |
+
padding: 0 !important;
|
37 |
+
font-size: 0.95rem !important;
|
38 |
+
line-height: var(--standard-line-height) !important;
|
39 |
+
}
|
40 |
+
|
41 |
+
/* Move content to top of columns with minimal padding */
|
42 |
+
[data-testid="column"] {
|
43 |
+
align-items: flex-start !important;
|
44 |
+
padding: 0 0.5rem !important;
|
45 |
+
gap: 0.5rem !important;
|
46 |
+
}
|
47 |
+
|
48 |
+
/* Clean minimal heading styles with better line height */
|
49 |
h1, h2, h3, h4, h5, h6 {
|
50 |
+
font-family: 'Inter', system-ui, sans-serif;
|
51 |
font-weight: 600;
|
52 |
+
color: #111827;
|
53 |
+
margin: 0.4rem 0 0.2rem 0 !important;
|
54 |
+
padding: 0 !important;
|
55 |
+
background-color: transparent !important;
|
56 |
+
line-height: 1.3 !important; /* Slightly increased for headings but still compact */
|
57 |
+
}
|
58 |
+
|
59 |
+
/* Simple section headers with subtle styling */
|
60 |
+
.block-container [data-testid="column"] h4 {
|
61 |
+
font-size: 0.95rem !important;
|
62 |
+
font-weight: 600 !important;
|
63 |
+
color: #374151 !important;
|
64 |
+
border-bottom: 1px solid #e5e7eb;
|
65 |
+
padding-bottom: 0.15rem !important;
|
66 |
+
margin-bottom: 0.25rem !important;
|
67 |
+
}
|
68 |
+
|
69 |
+
/* Reduce whitespace between elements */
|
70 |
+
.element-container {
|
71 |
+
margin-bottom: 0.2rem !important;
|
72 |
+
}
|
73 |
+
|
74 |
+
/* OCR text container with improved contrast and styling */
|
75 |
+
.ocr-text-container {
|
76 |
+
font-family: 'Inter', system-ui, sans-serif;
|
77 |
+
font-size: 0.95rem;
|
78 |
+
line-height: var(--standard-line-height); /* Consistent line height */
|
79 |
+
color: #111827;
|
80 |
+
margin-bottom: 0.4rem;
|
81 |
+
max-height: 600px;
|
82 |
+
overflow-y: auto;
|
83 |
+
background-color: transparent;
|
84 |
+
padding: 6px 10px;
|
85 |
+
border-radius: 4px;
|
86 |
+
border: 1px solid #e2e8f0;
|
87 |
}
|
88 |
|
89 |
+
/* Custom scrollbar styling */
|
90 |
+
.ocr-text-container::-webkit-scrollbar {
|
91 |
+
width: 6px;
|
92 |
+
height: 6px;
|
|
|
|
|
|
|
|
|
93 |
}
|
94 |
|
95 |
+
.ocr-text-container::-webkit-scrollbar-track {
|
96 |
+
background: #f1f1f1;
|
97 |
+
border-radius: 3px;
|
98 |
}
|
99 |
|
100 |
+
.ocr-text-container::-webkit-scrollbar-thumb {
|
101 |
+
background: #c1c1c1;
|
102 |
+
border-radius: 3px;
|
|
|
|
|
|
|
|
|
103 |
}
|
104 |
|
105 |
+
.ocr-text-container::-webkit-scrollbar-thumb:hover {
|
106 |
+
background: #a0a0a0;
|
|
|
107 |
}
|
108 |
|
109 |
+
/* Styling for all expanders/accordions */
|
110 |
+
.st-expander,
|
111 |
+
details.streamlit-expanderHeader {
|
112 |
+
border: 1px solid #e5e7eb !important;
|
113 |
+
border-radius: 4px !important;
|
114 |
+
box-shadow: none !important;
|
115 |
+
background-color: transparent !important;
|
116 |
+
margin-bottom: 6px !important;
|
117 |
}
|
118 |
|
119 |
+
.st-expanderHeader,
|
120 |
+
summary.streamlit-expanderHeader {
|
121 |
+
font-size: 0.95rem !important;
|
122 |
+
font-weight: 600 !important;
|
123 |
+
color: #374151 !important;
|
124 |
+
padding: 0.4rem 0.6rem !important;
|
125 |
+
background-color: rgba(241, 245, 249, 0.5) !important;
|
126 |
+
border-bottom: 1px solid #e5e7eb !important;
|
127 |
+
border-radius: 3px 3px 0 0 !important;
|
128 |
}
|
129 |
|
130 |
+
.st-expanderContent,
|
131 |
+
details[open] > div:nth-child(2) {
|
132 |
+
border-top: none !important;
|
133 |
+
padding: 0.4rem 0.6rem !important;
|
134 |
+
background-color: transparent !important;
|
135 |
}
|
136 |
|
137 |
+
/* Set expander text to have good contrast */
|
138 |
+
.st-expanderContent p,
|
139 |
+
.st-expanderContent li,
|
140 |
+
.st-expanderContent span {
|
141 |
+
color: #1f2937 !important;
|
142 |
}
|
143 |
|
144 |
+
/* Streamlined OCR image display */
|
145 |
+
.ocr-image-container {
|
146 |
+
border: 1px solid #e2e8f0;
|
147 |
+
border-radius: 4px;
|
148 |
+
padding: 0;
|
149 |
+
background-color: transparent;
|
150 |
margin-bottom: 5px;
|
|
|
151 |
}
|
152 |
|
153 |
+
.ocr-image-container img {
|
154 |
+
border-radius: 4px;
|
155 |
+
width: 100%;
|
156 |
}
|
157 |
|
158 |
+
/* Subtle document sections */
|
159 |
+
.document-section {
|
160 |
+
margin-bottom: 0.4rem !important;
|
|
|
|
|
|
|
161 |
}
|
162 |
|
163 |
+
/* Compact tag styling */
|
164 |
+
.subject-tag {
|
165 |
+
display: inline-block;
|
166 |
+
padding: 0.1rem 0.4rem;
|
167 |
+
border-radius: 3px;
|
168 |
+
font-size: 0.7rem;
|
169 |
+
margin: 0 0.2rem 0.2rem 0;
|
170 |
+
background-color: #f3f4f6;
|
171 |
+
color: #374151;
|
172 |
+
border: 1px solid #e5e7eb;
|
173 |
}
|
174 |
|
175 |
+
.tag-time-period { color: #1e40af; background-color: #eff6ff; border-color: #bfdbfe; }
|
176 |
+
.tag-language { color: #065f46; background-color: #ecfdf5; border-color: #a7f3d0; }
|
177 |
+
.tag-document-type { color: #5b21b6; background-color: #f5f3ff; border-color: #ddd6fe; }
|
178 |
+
.tag-subject { color: #166534; background-color: #f0fdf4; border-color: #bbf7d0; }
|
179 |
+
|
180 |
+
/* Clean text area */
|
181 |
.stTextArea textarea {
|
182 |
+
font-family: 'Roboto Mono', monospace;
|
183 |
+
font-size: 0.9rem;
|
184 |
+
line-height: var(--standard-line-height); /* Consistent line height */
|
185 |
+
padding: 0.5rem;
|
186 |
}
|
187 |
|
188 |
+
/* Button styling - fixed for text overflow issues */
|
189 |
+
.stButton > button {
|
190 |
+
border-radius: 4px;
|
191 |
+
font-weight: 400;
|
192 |
+
line-height: var(--standard-line-height);
|
193 |
+
padding: 0.4rem 0.75rem !important;
|
194 |
+
margin: 0;
|
195 |
+
min-width: 150px !important; /* Increased minimum width */
|
196 |
+
white-space: normal !important; /* Allow text to wrap if needed */
|
197 |
+
overflow: visible !important; /* Ensure text doesn't get cut off */
|
198 |
+
height: auto !important; /* Allow height to adjust as needed */
|
199 |
+
text-overflow: clip !important; /* Don't clip text */
|
200 |
+
display: inline-block !important; /* Better content handling */
|
201 |
+
text-align: center !important; /* Center text */
|
202 |
+
}
|
203 |
+
|
204 |
+
/* Fix button text alignment and prevent truncation */
|
205 |
+
.stButton > button > div,
|
206 |
+
.stButton > button span,
|
207 |
+
.stButton > button p {
|
208 |
+
display: inline-block !important;
|
209 |
+
align-items: center;
|
210 |
+
white-space: normal !important;
|
211 |
+
overflow: visible !important;
|
212 |
+
width: auto !important;
|
213 |
+
text-overflow: clip !important;
|
214 |
+
word-wrap: normal !important;
|
215 |
+
}
|
216 |
+
|
217 |
+
/* Fix for all action buttons in the application */
|
218 |
+
[data-testid="stHorizontalBlock"] button,
|
219 |
+
button[key="close_document_btn"],
|
220 |
+
button[key="process_document_btn"],
|
221 |
+
button[key="load_sample_btn"],
|
222 |
+
button[key="view_btn"],
|
223 |
+
.stDownloadButton button,
|
224 |
+
button[key*="copy_btn"],
|
225 |
+
button[key*="download_btn"],
|
226 |
+
button[key*="view_"] {
|
227 |
+
width: auto !important;
|
228 |
+
min-width: 150px !important;
|
229 |
+
max-width: none !important;
|
230 |
+
display: inline-block !important;
|
231 |
+
white-space: normal !important;
|
232 |
+
overflow: visible !important;
|
233 |
+
text-align: center !important;
|
234 |
+
text-overflow: clip !important;
|
235 |
+
word-break: normal !important;
|
236 |
+
padding: 0.4rem 0.75rem !important;
|
237 |
}
|
238 |
+
|
239 |
+
/* Ensure text doesn't wrap awkwardly for buttons */
|
240 |
+
button span p {
|
241 |
+
margin: 0 !important;
|
242 |
+
padding: 0 !important;
|
243 |
+
white-space: normal !important;
|
244 |
+
overflow: visible !important;
|
245 |
}
|
246 |
|
247 |
+
/* Extra button container fixes for all button types */
|
248 |
+
.stButton, .stDownloadButton, [data-testid="stDownloadButton"] {
|
249 |
+
width: auto !important;
|
250 |
+
min-width: 150px !important;
|
251 |
+
overflow: visible !important;
|
252 |
+
display: block !important;
|
253 |
+
background-color: white;
|
254 |
+
border: 1px solid #ddd;
|
255 |
+
box-shadow: none !important;
|
256 |
}
|
257 |
|
258 |
+
/* Ensure consistent spacing in widgets */
|
259 |
+
.row-widget {
|
260 |
+
padding: 0.15rem 0 !important;
|
261 |
}
|
262 |
|
263 |
+
/* Fix spacing in expanders */
|
264 |
+
.stExpander > .streamlit-expanderContent > div {
|
265 |
+
padding-top: 0.15rem !important;
|
266 |
}
|
267 |
|
268 |
+
/* Optimized sidebar */
|
269 |
.sidebar .block-container {
|
270 |
+
padding-top: 0.6rem;
|
271 |
}
|
272 |
|
273 |
+
.sidebar .stRadio > div {
|
274 |
+
flex-direction: row;
|
|
|
|
|
|
|
|
|
|
|
|
|
275 |
}
|
276 |
|
277 |
+
.sidebar .stRadio label {
|
278 |
+
margin-right: 0.75rem;
|
279 |
+
font-size: 0.9rem;
|
280 |
}
|
281 |
|
282 |
+
/* Clean alert styles */
|
283 |
+
.stSuccess, .stError, .stInfo, .stWarning {
|
284 |
+
border-radius: 4px;
|
285 |
+
padding: 0.3rem 0.6rem;
|
286 |
+
margin: 0.2rem 0;
|
|
|
|
|
|
|
|
|
|
|
287 |
}
|
288 |
|
289 |
+
/* Fix any remaining spacing issues */
|
290 |
+
div.element-container > div > div {
|
291 |
+
margin: 0 !important;
|
292 |
+
line-height: var(--standard-line-height); /* Ensure consistent line height */
|
|
|
|
|
|
|
293 |
}
|
294 |
|
295 |
+
/* Fix column layouts for button containers */
|
296 |
+
[data-testid="column"] > div:has(.stButton) {
|
297 |
+
display: flex;
|
298 |
+
justify-content: flex-start;
|
299 |
+
align-items: center;
|
300 |
+
min-height: 38px; /* Match standard button height */
|
301 |
}
|
302 |
|
303 |
+
/* Fix for tabs being cut off at the top of the page */
|
304 |
+
/* Main container adjustments to avoid header overlap */
|
305 |
+
.main .block-container {
|
306 |
+
padding-top: 3rem !important; /* Increased top padding to make room for Streamlit header */
|
307 |
}
|
308 |
|
309 |
+
[data-testid="stTabs"] {
|
310 |
+
width: 100%;
|
311 |
+
overflow-x: visible !important;
|
312 |
+
position: relative;
|
313 |
+
z-index: 1; /* Ensure tabs are on the right layer */
|
314 |
}
|
315 |
|
316 |
+
[data-testid="stTabs"] > div:first-child {
|
317 |
+
padding-left: 0.5rem;
|
318 |
+
padding-right: 0.5rem;
|
319 |
+
overflow-x: visible !important;
|
|
|
|
|
|
|
320 |
}
|
321 |
|
322 |
+
[data-testid="stTabs"] [role="tab"] {
|
323 |
+
padding: 0.5rem 1rem;
|
324 |
+
min-width: fit-content;
|
325 |
+
white-space: nowrap;
|
|
|
|
|
|
|
326 |
}
|
327 |
|
328 |
+
[data-testid="stTabs"] [role="tablist"] {
|
329 |
+
overflow-x: visible !important;
|
330 |
+
flex-wrap: nowrap;
|
331 |
+
margin-top: 1rem; /* Add a bit more space at the top */
|
|
|
|
|
|
|
332 |
}
|
333 |
|
334 |
+
/* Fix header overlap issues */
|
335 |
+
header[data-testid="stHeader"] {
|
336 |
+
z-index: 999 !important; /* Keep header on top */
|
|
|
|
|
|
|
|
|
337 |
}
|
338 |
</style>
|
339 |
""", unsafe_allow_html=True)
|
ui_components.py
CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
|
|
2 |
import os
|
3 |
import io
|
4 |
import base64
|
|
|
5 |
from datetime import datetime
|
6 |
from pathlib import Path
|
7 |
import json
|
@@ -64,16 +65,16 @@ class ProgressReporter:
|
|
64 |
def create_sidebar_options():
|
65 |
"""Create and return sidebar options"""
|
66 |
with st.sidebar:
|
67 |
-
st.
|
68 |
|
69 |
# Create a container for the sidebar options
|
70 |
with st.container():
|
71 |
# Model selection
|
72 |
-
st.
|
73 |
use_vision = st.toggle("Use Vision Model", value=True, help="Use vision model for better understanding of document structure")
|
74 |
|
75 |
# Document type selection
|
76 |
-
st.
|
77 |
doc_type = st.selectbox("Document Type", DOCUMENT_TYPES,
|
78 |
help="Select the type of document you're processing for better results")
|
79 |
|
@@ -100,7 +101,7 @@ def create_sidebar_options():
|
|
100 |
st.markdown("**Custom Processing Instructions**")
|
101 |
custom_prompt = st.text_area("", value=custom_prompt,
|
102 |
help="Customize the instructions for processing this document",
|
103 |
-
height=
|
104 |
|
105 |
# Image preprocessing options in an expandable section
|
106 |
with st.expander("Image Preprocessing"):
|
@@ -131,8 +132,17 @@ def create_sidebar_options():
|
|
131 |
help="Rotate image if needed")
|
132 |
|
133 |
# Create preprocessing options dictionary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
preprocessing_options = {
|
135 |
-
"document_type":
|
136 |
"grayscale": grayscale,
|
137 |
"denoise": denoise,
|
138 |
"contrast": contrast,
|
@@ -141,23 +151,15 @@ def create_sidebar_options():
|
|
141 |
|
142 |
# PDF-specific options in an expandable section
|
143 |
with st.expander("PDF Options"):
|
144 |
-
pdf_dpi = st.slider("PDF Resolution (DPI)",
|
145 |
-
min_value=MIN_PDF_DPI,
|
146 |
-
max_value=MAX_PDF_DPI,
|
147 |
-
value=DEFAULT_PDF_DPI,
|
148 |
-
step=25,
|
149 |
-
help="Higher DPI gives better quality but slower processing")
|
150 |
-
|
151 |
max_pages = st.number_input("Maximum Pages to Process",
|
152 |
min_value=1,
|
153 |
max_value=20,
|
154 |
value=DEFAULT_MAX_PAGES,
|
155 |
help="Limit the number of pages to process (for multi-page PDFs)")
|
156 |
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
help="Rotate PDF pages if needed")
|
161 |
|
162 |
# Create options dictionary
|
163 |
options = {
|
@@ -175,28 +177,23 @@ def create_sidebar_options():
|
|
175 |
def create_file_uploader():
|
176 |
"""Create and return a file uploader"""
|
177 |
# Add app description
|
178 |
-
|
179 |
-
|
180 |
-
st.markdown(f'<div style="display: flex; align-items: center; gap: 10px;"><img src="data:image/png;base64,{favicon_base64}" width="36" height="36" alt="Scroll Icon"/> <div><h2 style="margin: 0; padding: 10px 0 0 0;">Historical Document OCR</h2></div></div>', unsafe_allow_html=True)
|
181 |
-
st.markdown("<p style='font-size: 0.8em; color: #666; text-align: right;'>Made possible by Mistral AI</p>", unsafe_allow_html=True)
|
182 |
|
183 |
# Add project framing
|
184 |
st.markdown("""
|
185 |
-
This tool
|
186 |
-
|
187 |
-
|
|
|
188 |
|
189 |
-
|
190 |
-
- **Handwritten documents** from various time periods
|
191 |
-
- **Photos of archival materials** that may be difficult to read
|
192 |
-
|
193 |
-
Upload a document to get started, or explore the example documents.
|
194 |
""")
|
195 |
|
196 |
-
# Create file uploader
|
197 |
uploaded_file = st.file_uploader(
|
198 |
-
"
|
199 |
-
type=["pdf", "png", "jpg"
|
200 |
help="Upload a PDF or image file for OCR processing"
|
201 |
)
|
202 |
return uploaded_file
|
@@ -204,136 +201,407 @@ def create_file_uploader():
|
|
204 |
def display_results(result, container, custom_prompt=""):
|
205 |
"""Display OCR results in the provided container"""
|
206 |
with container:
|
207 |
-
#
|
208 |
-
|
|
|
|
|
209 |
|
210 |
-
#
|
211 |
-
|
|
|
212 |
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
|
|
|
|
|
|
217 |
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
|
232 |
-
#
|
233 |
if 'topics' in result and result['topics']:
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
|
|
242 |
|
243 |
-
#
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
badge_color = "#00695c" # Teal for languages
|
248 |
-
elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
|
249 |
-
badge_color = "#6a1b9a" # Purple for document types
|
250 |
-
elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
|
251 |
-
badge_color = "#2e7d32" # Green for subject domains
|
252 |
-
elif any(term in topic.lower() for term in ["preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated"]):
|
253 |
-
badge_color = "#e65100" # Orange for preprocessing-related tags
|
254 |
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
|
264 |
-
#
|
265 |
-
st.subheader("OCR Content")
|
266 |
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
|
|
|
|
|
|
275 |
|
|
|
276 |
with content_tab1:
|
277 |
-
# Display structured content
|
278 |
if isinstance(result['ocr_contents'], dict):
|
279 |
-
|
280 |
-
|
281 |
-
|
|
|
|
|
|
|
|
|
282 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
if isinstance(content, str):
|
284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
elif isinstance(content, list):
|
|
|
|
|
286 |
for item in content:
|
287 |
if isinstance(item, str):
|
288 |
-
|
|
|
|
|
|
|
289 |
else:
|
290 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
291 |
elif isinstance(content, dict):
|
|
|
|
|
292 |
for k, v in content.items():
|
293 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
294 |
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
raw_text = result['ocr_contents']['content']
|
302 |
-
|
303 |
-
# Allow editing of the raw text
|
304 |
-
edited_text = st.text_area("Edit Raw Text", raw_text, height=400)
|
305 |
-
|
306 |
-
# Add a button to copy the edited text to clipboard
|
307 |
-
if st.button("Copy to Clipboard"):
|
308 |
-
st.success("Text copied to clipboard! (You can paste it elsewhere)")
|
309 |
-
# Note: The actual clipboard functionality is handled by the browser
|
310 |
-
|
311 |
-
# Add a download button for the edited text
|
312 |
-
st.download_button(
|
313 |
-
label="Download Edited Text",
|
314 |
-
data=edited_text,
|
315 |
-
file_name=f"{result.get('file_name', 'document').split('.')[0]}_edited.txt",
|
316 |
-
mime="text/plain"
|
317 |
-
)
|
318 |
|
319 |
-
|
320 |
-
with content_tab3:
|
321 |
-
# Use the display_document_with_images function
|
322 |
-
display_document_with_images(result)
|
323 |
-
|
324 |
-
# Display custom prompt if provided
|
325 |
-
if custom_prompt:
|
326 |
-
with st.expander("Custom Processing Instructions"):
|
327 |
-
st.write(custom_prompt)
|
328 |
-
|
329 |
-
# Add download buttons
|
330 |
-
st.subheader("Download Results")
|
331 |
-
|
332 |
-
# Create columns for download buttons
|
333 |
-
download_col1, download_col2 = st.columns(2)
|
334 |
-
|
335 |
-
with download_col1:
|
336 |
-
# JSON download
|
337 |
try:
|
338 |
json_str = json.dumps(result, indent=2)
|
339 |
st.download_button(
|
@@ -344,8 +612,7 @@ def display_results(result, container, custom_prompt=""):
|
|
344 |
)
|
345 |
except Exception as e:
|
346 |
st.error(f"Error creating JSON download: {str(e)}")
|
347 |
-
|
348 |
-
with download_col2:
|
349 |
# Text download
|
350 |
try:
|
351 |
if 'ocr_contents' in result:
|
@@ -369,314 +636,319 @@ def display_results(result, container, custom_prompt=""):
|
|
369 |
|
370 |
def display_document_with_images(result):
|
371 |
"""Display document with images"""
|
372 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
373 |
st.info("No image data available.")
|
374 |
return
|
375 |
|
376 |
# Display each page
|
377 |
-
for i, page_data in enumerate(
|
378 |
st.markdown(f"### Page {i+1}")
|
379 |
|
380 |
# Create columns for image and text
|
381 |
img_col, text_col = st.columns([1, 1])
|
382 |
|
383 |
with img_col:
|
384 |
-
# Display the image
|
|
|
|
|
|
|
385 |
if 'image_data' in page_data:
|
386 |
try:
|
387 |
# Convert base64 to image
|
388 |
image_data = base64.b64decode(page_data['image_data'])
|
389 |
st.image(io.BytesIO(image_data), use_container_width=True)
|
|
|
390 |
except Exception as e:
|
391 |
-
st.error(f"Error displaying image: {str(e)}")
|
392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
393 |
st.info("No image available for this page.")
|
394 |
|
395 |
with text_col:
|
396 |
-
#
|
|
|
397 |
if 'text' in page_data:
|
398 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
399 |
|
400 |
-
# Add a button to copy the edited text to clipboard
|
401 |
-
|
402 |
-
st.success(f"Page {i+1} text copied to clipboard!")
|
403 |
else:
|
404 |
st.info("No text available for this page.")
|
405 |
|
406 |
def display_previous_results():
|
407 |
-
"""Display previous results tab content"""
|
408 |
-
st.markdown('<h2>Previous Results</h2>', unsafe_allow_html=True)
|
409 |
|
410 |
-
#
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
except ImportError:
|
415 |
-
# If ui.layout module is not available, use a simplified version
|
416 |
-
st.markdown("""
|
417 |
-
<style>
|
418 |
-
.previous-results-container {
|
419 |
-
margin-top: 20px;
|
420 |
-
}
|
421 |
-
.result-card {
|
422 |
-
background-color: #f8f9fa;
|
423 |
-
border-radius: 8px;
|
424 |
-
padding: 15px;
|
425 |
-
margin-bottom: 15px;
|
426 |
-
border: 1px solid #e0e0e0;
|
427 |
-
}
|
428 |
-
.result-header {
|
429 |
-
display: flex;
|
430 |
-
justify-content: space-between;
|
431 |
-
margin-bottom: 10px;
|
432 |
-
}
|
433 |
-
.result-filename {
|
434 |
-
font-weight: bold;
|
435 |
-
font-size: 16px;
|
436 |
-
}
|
437 |
-
.result-date {
|
438 |
-
color: #666;
|
439 |
-
font-size: 14px;
|
440 |
-
}
|
441 |
-
.result-metadata {
|
442 |
-
margin-top: 10px;
|
443 |
-
font-size: 14px;
|
444 |
-
}
|
445 |
-
.result-tag {
|
446 |
-
margin-bottom: 5px;
|
447 |
-
color: #555;
|
448 |
-
}
|
449 |
-
.result-action-button {
|
450 |
-
margin-top: 10px;
|
451 |
-
text-align: right;
|
452 |
-
}
|
453 |
-
.selected-result-container {
|
454 |
-
margin-top: 30px;
|
455 |
-
padding: 20px;
|
456 |
-
background-color: #f0f2f6;
|
457 |
-
border-radius: 8px;
|
458 |
-
}
|
459 |
-
.selected-result-title {
|
460 |
-
font-size: 18px;
|
461 |
-
font-weight: bold;
|
462 |
-
}
|
463 |
-
</style>
|
464 |
-
""", unsafe_allow_html=True)
|
465 |
|
466 |
# Display previous results if available
|
467 |
if not st.session_state.previous_results:
|
468 |
st.markdown("""
|
469 |
-
<div
|
470 |
-
<div style="font-size:
|
471 |
-
<
|
472 |
-
<p style="font-size:
|
473 |
</div>
|
474 |
""", unsafe_allow_html=True)
|
475 |
else:
|
476 |
-
#
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
with filter_col:
|
484 |
-
# Add filter options
|
485 |
-
filter_options = ["All Types"]
|
486 |
-
if any(result.get("file_name", "").lower().endswith(".pdf") for result in st.session_state.previous_results):
|
487 |
-
filter_options.append("PDF Documents")
|
488 |
-
if any(result.get("file_name", "").lower().endswith((".jpg", ".jpeg", ".png")) for result in st.session_state.previous_results):
|
489 |
-
filter_options.append("Images")
|
490 |
|
491 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
492 |
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
|
|
|
506 |
|
507 |
-
#
|
508 |
-
|
509 |
-
|
510 |
|
511 |
-
#
|
512 |
-
if
|
513 |
-
|
514 |
-
elif
|
515 |
-
|
516 |
-
elif img_count > 0:
|
517 |
-
zip_filename = f"historical_ocr_images_{img_count}_{timestamp}.zip"
|
518 |
else:
|
519 |
-
|
520 |
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
-
|
530 |
-
|
531 |
-
|
532 |
-
|
533 |
-
|
534 |
-
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
|
539 |
-
# Show a message if no results match the filter
|
540 |
-
if not filtered_results:
|
541 |
-
st.markdown("""
|
542 |
-
<div style="text-align: center; padding: 20px; background-color: #f9f9f9; border-radius: 5px; margin: 20px 0;">
|
543 |
-
<p>No results match the selected filter.</p>
|
544 |
-
</div>
|
545 |
-
""", unsafe_allow_html=True)
|
546 |
-
|
547 |
-
# Display each result as a card
|
548 |
-
for i, result in enumerate(filtered_results):
|
549 |
-
# Determine file type icon
|
550 |
-
file_name = result.get("file_name", f"Document {i+1}")
|
551 |
-
file_type_lower = file_name.lower()
|
552 |
-
|
553 |
-
if file_type_lower.endswith(".pdf"):
|
554 |
-
icon = "📄"
|
555 |
-
elif file_type_lower.endswith((".jpg", ".jpeg", ".png", ".gif")):
|
556 |
-
icon = "🖼️"
|
557 |
-
else:
|
558 |
-
icon = "📝"
|
559 |
-
|
560 |
-
# Create a card for each result
|
561 |
-
st.markdown(f"""
|
562 |
-
<div class="result-card">
|
563 |
-
<div class="result-header">
|
564 |
-
<div class="result-filename">{icon} {result.get('descriptive_file_name', file_name)}</div>
|
565 |
-
<div class="result-date">{result.get('timestamp', 'Unknown')}</div>
|
566 |
-
</div>
|
567 |
-
<div class="result-metadata">
|
568 |
-
<div class="result-tag">Languages: {', '.join(result.get('languages', ['Unknown']))}</div>
|
569 |
-
<div class="result-tag">Topics: {', '.join(result.get('topics', ['Unknown'])[:5])} {' + ' + str(len(result.get('topics', [])) - 5) + ' more' if len(result.get('topics', [])) > 5 else ''}</div>
|
570 |
-
</div>
|
571 |
-
""", unsafe_allow_html=True)
|
572 |
-
|
573 |
-
# Add view button inside the card with proper styling
|
574 |
-
st.markdown('<div class="result-action-button">', unsafe_allow_html=True)
|
575 |
-
if st.button(f"View Document", key=f"view_{i}"):
|
576 |
-
# Set the selected result in the session state
|
577 |
-
st.session_state.selected_previous_result = st.session_state.previous_results[i]
|
578 |
-
# Force a rerun to show the selected result
|
579 |
-
st.rerun()
|
580 |
-
st.markdown('</div>', unsafe_allow_html=True)
|
581 |
-
|
582 |
-
# Close the result card
|
583 |
-
st.markdown('</div>', unsafe_allow_html=True)
|
584 |
-
|
585 |
-
# Close the container
|
586 |
-
st.markdown('</div>', unsafe_allow_html=True)
|
587 |
|
588 |
# Display the selected result if available
|
589 |
if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
|
590 |
selected_result = st.session_state.selected_previous_result
|
591 |
|
592 |
-
#
|
593 |
-
st.markdown(
|
594 |
-
|
595 |
-
|
596 |
-
|
597 |
-
|
598 |
-
|
599 |
-
|
600 |
-
|
601 |
-
|
602 |
-
|
603 |
-
|
604 |
-
|
605 |
-
|
606 |
-
|
607 |
-
|
608 |
-
|
609 |
-
|
|
|
|
|
610 |
|
611 |
-
|
612 |
-
|
613 |
-
|
614 |
-
|
615 |
-
|
616 |
-
|
617 |
-
# Generate a badge for each tag
|
618 |
-
for topic in selected_result['topics']:
|
619 |
-
# Create colored badge based on tag category
|
620 |
-
badge_color = "#546e7a" # Default color
|
621 |
-
|
622 |
-
# Assign colors by category
|
623 |
-
if any(term in topic.lower() for term in ["century", "pre-", "era", "historical"]):
|
624 |
-
badge_color = "#1565c0" # Blue for time periods
|
625 |
-
elif any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
|
626 |
-
badge_color = "#00695c" # Teal for languages
|
627 |
-
elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
|
628 |
-
badge_color = "#6a1b9a" # Purple for document types
|
629 |
-
elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
|
630 |
-
badge_color = "#2e7d32" # Green for subject domains
|
631 |
-
elif any(term in topic.lower() for term in ["preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated"]):
|
632 |
-
badge_color = "#e65100" # Orange for preprocessing-related tags
|
633 |
-
|
634 |
-
st.markdown(
|
635 |
-
f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
|
636 |
-
f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">{topic}</span>',
|
637 |
-
unsafe_allow_html=True
|
638 |
-
)
|
639 |
-
|
640 |
-
# Close the container
|
641 |
-
st.markdown('</div>', unsafe_allow_html=True)
|
642 |
|
643 |
-
|
644 |
-
|
645 |
-
|
646 |
-
st.info(f"Processed {selected_result['limited_pages']['processed']} of {selected_result['limited_pages']['total']} pages")
|
647 |
|
648 |
-
|
649 |
-
|
650 |
-
st.write(f"**Processing Time:** {proc_time:.1f}s")
|
651 |
|
652 |
-
#
|
653 |
has_images = selected_result.get('has_images', False)
|
654 |
if has_images:
|
655 |
-
|
|
|
656 |
else:
|
657 |
-
|
|
|
658 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
659 |
with view_tab1:
|
660 |
-
# Display
|
661 |
if 'ocr_contents' in selected_result and isinstance(selected_result['ocr_contents'], dict):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
662 |
for section, content in selected_result['ocr_contents'].items():
|
663 |
-
if
|
664 |
-
|
|
|
|
|
665 |
|
666 |
if isinstance(content, str):
|
667 |
-
st.
|
668 |
elif isinstance(content, list):
|
669 |
for item in content:
|
670 |
-
|
671 |
-
st.write(f"- {item}")
|
672 |
-
else:
|
673 |
-
st.write(f"- {str(item)}")
|
674 |
elif isinstance(content, dict):
|
675 |
for k, v in content.items():
|
676 |
-
st.
|
677 |
|
|
|
678 |
with view_tab2:
|
679 |
-
#
|
680 |
raw_text = ""
|
681 |
if 'ocr_contents' in selected_result:
|
682 |
if 'raw_text' in selected_result['ocr_contents']:
|
@@ -684,74 +956,91 @@ def display_previous_results():
|
|
684 |
elif 'content' in selected_result['ocr_contents']:
|
685 |
raw_text = selected_result['ocr_contents']['content']
|
686 |
|
687 |
-
#
|
688 |
-
edited_text = st.text_area("
|
689 |
-
|
690 |
-
# Add a button to copy the edited text to clipboard
|
691 |
-
if st.button("Copy to Clipboard", key="selected_copy_btn"):
|
692 |
-
st.success("Text copied to clipboard! (You can paste it elsewhere)")
|
693 |
|
694 |
-
# Add
|
695 |
-
st.
|
696 |
-
|
697 |
-
|
698 |
-
|
699 |
-
|
700 |
-
|
701 |
-
|
|
|
|
|
|
|
|
|
702 |
|
|
|
703 |
if has_images and 'pages_data' in selected_result:
|
704 |
with view_tab3:
|
705 |
-
#
|
706 |
-
|
707 |
-
|
708 |
-
|
709 |
-
|
710 |
-
|
711 |
-
|
712 |
-
|
713 |
-
|
714 |
-
|
715 |
-
|
716 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
717 |
|
718 |
def display_about_tab():
|
719 |
"""Display about tab content"""
|
720 |
-
st.
|
721 |
|
722 |
# Add app description
|
723 |
st.markdown("""
|
724 |
**Historical OCR** is a specialized tool for extracting text from historical documents, manuscripts, and printed materials.
|
|
|
725 |
|
726 |
-
|
727 |
-
|
|
|
728 |
This tool is designed to assist scholars in historical research by extracting text from challenging documents.
|
729 |
While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
|
730 |
historical documents, particularly:
|
|
|
731 |
|
|
|
732 |
- **Historical newspapers** with complex layouts and aged text
|
733 |
- **Handwritten documents** from various time periods
|
734 |
- **Photos of archival materials** that may be difficult to read
|
|
|
735 |
|
736 |
-
|
737 |
-
|
|
|
738 |
- **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
|
739 |
- **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
|
740 |
- **Editable Results**: Review and edit extracted text directly in the interface
|
741 |
- **Structured Content Analysis**: Automatic organization of document content
|
742 |
- **Multi-language Support**: Process documents in various languages
|
743 |
- **PDF Processing**: Handle multi-page historical documents
|
|
|
744 |
|
745 |
-
|
746 |
-
|
|
|
747 |
1. Upload a document (PDF or image)
|
748 |
2. Select the document type and adjust preprocessing options if needed
|
749 |
3. Add custom processing instructions for specialized documents
|
750 |
4. Process the document
|
751 |
5. Review, edit, and download the results
|
|
|
752 |
|
753 |
-
|
754 |
-
|
|
|
755 |
- OCR processing using Mistral AI's advanced document understanding capabilities
|
756 |
- Image preprocessing with OpenCV
|
757 |
- PDF handling with pdf2image
|
|
|
2 |
import os
|
3 |
import io
|
4 |
import base64
|
5 |
+
import logging
|
6 |
from datetime import datetime
|
7 |
from pathlib import Path
|
8 |
import json
|
|
|
65 |
def create_sidebar_options():
|
66 |
"""Create and return sidebar options"""
|
67 |
with st.sidebar:
|
68 |
+
st.markdown("## OCR Settings")
|
69 |
|
70 |
# Create a container for the sidebar options
|
71 |
with st.container():
|
72 |
# Model selection
|
73 |
+
st.markdown("### Model Selection")
|
74 |
use_vision = st.toggle("Use Vision Model", value=True, help="Use vision model for better understanding of document structure")
|
75 |
|
76 |
# Document type selection
|
77 |
+
st.markdown("### Document Type")
|
78 |
doc_type = st.selectbox("Document Type", DOCUMENT_TYPES,
|
79 |
help="Select the type of document you're processing for better results")
|
80 |
|
|
|
101 |
st.markdown("**Custom Processing Instructions**")
|
102 |
custom_prompt = st.text_area("", value=custom_prompt,
|
103 |
help="Customize the instructions for processing this document",
|
104 |
+
height=80)
|
105 |
|
106 |
# Image preprocessing options in an expandable section
|
107 |
with st.expander("Image Preprocessing"):
|
|
|
132 |
help="Rotate image if needed")
|
133 |
|
134 |
# Create preprocessing options dictionary
|
135 |
+
# Set document_type based on selection in UI
|
136 |
+
doc_type_for_preprocessing = "standard"
|
137 |
+
if "Handwritten" in doc_type:
|
138 |
+
doc_type_for_preprocessing = "handwritten"
|
139 |
+
elif "Newspaper" in doc_type or "Magazine" in doc_type:
|
140 |
+
doc_type_for_preprocessing = "newspaper"
|
141 |
+
elif "Book" in doc_type or "Publication" in doc_type:
|
142 |
+
doc_type_for_preprocessing = "printed"
|
143 |
+
|
144 |
preprocessing_options = {
|
145 |
+
"document_type": doc_type_for_preprocessing,
|
146 |
"grayscale": grayscale,
|
147 |
"denoise": denoise,
|
148 |
"contrast": contrast,
|
|
|
151 |
|
152 |
# PDF-specific options in an expandable section
|
153 |
with st.expander("PDF Options"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
max_pages = st.number_input("Maximum Pages to Process",
|
155 |
min_value=1,
|
156 |
max_value=20,
|
157 |
value=DEFAULT_MAX_PAGES,
|
158 |
help="Limit the number of pages to process (for multi-page PDFs)")
|
159 |
|
160 |
+
# Set default values for removed options
|
161 |
+
pdf_dpi = DEFAULT_PDF_DPI
|
162 |
+
pdf_rotation = 0
|
|
|
163 |
|
164 |
# Create options dictionary
|
165 |
options = {
|
|
|
177 |
def create_file_uploader():
|
178 |
"""Create and return a file uploader"""
|
179 |
# Add app description
|
180 |
+
st.markdown(f'<div style="display: flex; align-items: center; gap: 10px;"><div style="font-size: 32px;">📜</div><div><h2 style="margin: 0; padding: 10px 0 0 0;">Historical OCR</h2></div></div>', unsafe_allow_html=True)
|
181 |
+
st.markdown("<p style='font-size: 0.8em; color: #666; text-align: left;'>Made possible by Mistral AI</p>", unsafe_allow_html=True)
|
|
|
|
|
182 |
|
183 |
# Add project framing
|
184 |
st.markdown("""
|
185 |
+
This tool assists scholars in historical research by extracting text from challenging documents. While it may not achieve 100% accuracy, it helps navigate:
|
186 |
+
- **Historical newspapers** with complex layouts
|
187 |
+
- **Handwritten documents** from various periods
|
188 |
+
- **Photos of archival materials**
|
189 |
|
190 |
+
Upload a document to begin, or explore the examples.
|
|
|
|
|
|
|
|
|
191 |
""")
|
192 |
|
193 |
+
# Create file uploader with a more concise label
|
194 |
uploaded_file = st.file_uploader(
|
195 |
+
"Select file",
|
196 |
+
type=["pdf", "png", "jpg"],
|
197 |
help="Upload a PDF or image file for OCR processing"
|
198 |
)
|
199 |
return uploaded_file
|
|
|
201 |
def display_results(result, container, custom_prompt=""):
|
202 |
"""Display OCR results in the provided container"""
|
203 |
with container:
|
204 |
+
# No heading for document metadata - start directly with content
|
205 |
+
|
206 |
+
# Create a compact metadata section
|
207 |
+
meta_html = '<div style="display: flex; flex-wrap: wrap; gap: 0.3rem; margin-bottom: 0.3rem;">'
|
208 |
|
209 |
+
# Document type
|
210 |
+
if 'detected_document_type' in result:
|
211 |
+
meta_html += f'<div><strong>Type:</strong> {result["detected_document_type"]}</div>'
|
212 |
|
213 |
+
# Processing time
|
214 |
+
if 'processing_time' in result:
|
215 |
+
meta_html += f'<div><strong>Time:</strong> {result["processing_time"]:.1f}s</div>'
|
216 |
+
|
217 |
+
# Page information
|
218 |
+
if 'limited_pages' in result:
|
219 |
+
meta_html += f'<div><strong>Pages:</strong> {result["limited_pages"]["processed"]}/{result["limited_pages"]["total"]}</div>'
|
220 |
|
221 |
+
meta_html += '</div>'
|
222 |
+
st.markdown(meta_html, unsafe_allow_html=True)
|
223 |
+
|
224 |
+
# Language metadata on a separate line, Subject Tags below
|
225 |
|
226 |
+
# First show languages if available
|
227 |
+
if 'languages' in result and result['languages']:
|
228 |
+
languages = [lang for lang in result['languages'] if lang is not None]
|
229 |
+
if languages:
|
230 |
+
# Create a dedicated line for Languages
|
231 |
+
lang_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
|
232 |
+
lang_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Language:</div>'
|
233 |
+
|
234 |
+
# Add language tags
|
235 |
+
for lang in languages:
|
236 |
+
# Clean language name if needed
|
237 |
+
clean_lang = str(lang).strip()
|
238 |
+
if clean_lang: # Only add if not empty
|
239 |
+
lang_html += f'<span class="subject-tag tag-language">{clean_lang}</span>'
|
240 |
+
|
241 |
+
lang_html += '</div>'
|
242 |
+
st.markdown(lang_html, unsafe_allow_html=True)
|
243 |
+
|
244 |
+
# Create a separate line for Time if we have time-related tags
|
245 |
+
if 'topics' in result and result['topics']:
|
246 |
+
time_tags = [topic for topic in result['topics']
|
247 |
+
if any(term in topic.lower() for term in ["century", "pre-", "era", "historical"])]
|
248 |
+
if time_tags:
|
249 |
+
time_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
|
250 |
+
time_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Time:</div>'
|
251 |
+
for tag in time_tags:
|
252 |
+
time_html += f'<span class="subject-tag tag-time-period">{tag}</span>'
|
253 |
+
time_html += '</div>'
|
254 |
+
st.markdown(time_html, unsafe_allow_html=True)
|
255 |
|
256 |
+
# Then display remaining subject tags if available
|
257 |
if 'topics' in result and result['topics']:
|
258 |
+
# Filter out time-related tags which are already displayed
|
259 |
+
subject_tags = [topic for topic in result['topics']
|
260 |
+
if not any(term in topic.lower() for term in ["century", "pre-", "era", "historical"])]
|
261 |
+
|
262 |
+
if subject_tags:
|
263 |
+
# Create a separate line for Subject Tags
|
264 |
+
tags_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
|
265 |
+
tags_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Subject Tags:</div>'
|
266 |
+
tags_html += '<div style="display: flex; flex-wrap: wrap; gap: 2px; align-items: center;">'
|
267 |
|
268 |
+
# Generate a badge for each remaining tag
|
269 |
+
for topic in subject_tags:
|
270 |
+
# Determine tag category class
|
271 |
+
tag_class = "subject-tag" # Default class
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
272 |
|
273 |
+
# Add specialized class based on category
|
274 |
+
if any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
|
275 |
+
tag_class += " tag-language" # Languages
|
276 |
+
elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
|
277 |
+
tag_class += " tag-document-type" # Document types
|
278 |
+
elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
|
279 |
+
tag_class += " tag-subject" # Subject domains
|
280 |
+
|
281 |
+
# Add each tag as an inline span
|
282 |
+
tags_html += f'<span class="{tag_class}">{topic}</span>'
|
283 |
+
|
284 |
+
# Close the containers
|
285 |
+
tags_html += '</div></div>'
|
286 |
+
|
287 |
+
# Render the subject tags section
|
288 |
+
st.markdown(tags_html, unsafe_allow_html=True)
|
289 |
|
290 |
+
# No OCR content heading - start directly with tabs
|
|
|
291 |
|
292 |
+
# Check if we have OCR content
|
293 |
+
if 'ocr_contents' in result:
|
294 |
+
# Create a single view instead of tabs
|
295 |
+
content_tab1 = st.container()
|
296 |
+
|
297 |
+
# Check for images in the result to use later
|
298 |
+
has_images = result.get('has_images', False)
|
299 |
+
has_image_data = ('pages_data' in result and any(page.get('images', []) for page in result.get('pages_data', [])))
|
300 |
+
has_raw_images = ('raw_response_data' in result and 'pages' in result['raw_response_data'] and
|
301 |
+
any('images' in page for page in result['raw_response_data']['pages']
|
302 |
+
if isinstance(page, dict)))
|
303 |
|
304 |
+
# Display structured content
|
305 |
with content_tab1:
|
306 |
+
# Display structured content with markdown formatting
|
307 |
if isinstance(result['ocr_contents'], dict):
|
308 |
+
# CSS is now handled in the main layout.py file
|
309 |
+
|
310 |
+
# Function to process text with markdown support
|
311 |
+
def format_markdown_text(text):
|
312 |
+
"""Format text with markdown and handle special patterns"""
|
313 |
+
if not text:
|
314 |
+
return ""
|
315 |
|
316 |
+
import re
|
317 |
+
|
318 |
+
# First, ensure we're working with a string
|
319 |
+
if not isinstance(text, str):
|
320 |
+
text = str(text)
|
321 |
+
|
322 |
+
# Ensure newlines are preserved for proper spacing
|
323 |
+
# Convert any Windows line endings to Unix
|
324 |
+
text = text.replace('\r\n', '\n')
|
325 |
+
|
326 |
+
# Format dates (MM/DD/YYYY or similar patterns)
|
327 |
+
date_pattern = r'\b(0?[1-9]|1[0-2])[\/\-\.](0?[1-9]|[12][0-9]|3[01])[\/\-\.](\d{4}|\d{2})\b'
|
328 |
+
text = re.sub(date_pattern, r'**\g<0>**', text)
|
329 |
+
|
330 |
+
# Detect markdown tables and preserve them
|
331 |
+
table_sections = []
|
332 |
+
non_table_lines = []
|
333 |
+
in_table = False
|
334 |
+
table_buffer = []
|
335 |
+
|
336 |
+
# Process text line by line, preserving tables
|
337 |
+
lines = text.split('\n')
|
338 |
+
for i, line in enumerate(lines):
|
339 |
+
line_stripped = line.strip()
|
340 |
+
|
341 |
+
# Detect table rows by pipe character
|
342 |
+
if '|' in line_stripped and (line_stripped.startswith('|') or line_stripped.endswith('|')):
|
343 |
+
if not in_table:
|
344 |
+
in_table = True
|
345 |
+
if table_buffer:
|
346 |
+
table_buffer = []
|
347 |
+
table_buffer.append(line)
|
348 |
+
|
349 |
+
# Check if the next line is a table separator
|
350 |
+
if i < len(lines) - 1 and '---' in lines[i+1] and '|' in lines[i+1]:
|
351 |
+
table_buffer.append(lines[i+1])
|
352 |
+
|
353 |
+
# Detect table separators (---|---|---)
|
354 |
+
elif in_table and '---' in line_stripped and '|' in line_stripped:
|
355 |
+
table_buffer.append(line)
|
356 |
+
|
357 |
+
# End of table detection
|
358 |
+
elif in_table:
|
359 |
+
# Check if this is still part of the table
|
360 |
+
next_line_is_table = False
|
361 |
+
if i < len(lines) - 1:
|
362 |
+
next_line = lines[i+1].strip()
|
363 |
+
if '|' in next_line and (next_line.startswith('|') or next_line.endswith('|')):
|
364 |
+
next_line_is_table = True
|
365 |
+
|
366 |
+
if not next_line_is_table:
|
367 |
+
in_table = False
|
368 |
+
# Save the complete table
|
369 |
+
if table_buffer:
|
370 |
+
table_sections.append('\n'.join(table_buffer))
|
371 |
+
table_buffer = []
|
372 |
+
# Add current line to non-table lines
|
373 |
+
non_table_lines.append(line)
|
374 |
+
else:
|
375 |
+
# Still part of the table
|
376 |
+
table_buffer.append(line)
|
377 |
+
else:
|
378 |
+
# Not in a table
|
379 |
+
non_table_lines.append(line)
|
380 |
+
|
381 |
+
# Handle any remaining table buffer
|
382 |
+
if in_table and table_buffer:
|
383 |
+
table_sections.append('\n'.join(table_buffer))
|
384 |
+
|
385 |
+
# Process non-table lines
|
386 |
+
processed_lines = []
|
387 |
+
for line in non_table_lines:
|
388 |
+
line_stripped = line.strip()
|
389 |
+
|
390 |
+
# Check if line is in ALL CAPS (and not just a short acronym)
|
391 |
+
if line_stripped and line_stripped.isupper() and len(line_stripped) > 3:
|
392 |
+
# ALL CAPS line - make bold instead of heading to prevent large display
|
393 |
+
processed_lines.append(f"**{line_stripped}**")
|
394 |
+
# Process potential headers (lines ending with colon)
|
395 |
+
elif line_stripped and line_stripped.endswith(':') and len(line_stripped) < 40:
|
396 |
+
# Likely a header - make it bold
|
397 |
+
processed_lines.append(f"**{line_stripped}**")
|
398 |
+
else:
|
399 |
+
# Keep original line with its spacing
|
400 |
+
processed_lines.append(line)
|
401 |
+
|
402 |
+
# Join non-table lines
|
403 |
+
processed_text = '\n'.join(processed_lines)
|
404 |
+
|
405 |
+
# Reinsert tables in the right positions
|
406 |
+
for table in table_sections:
|
407 |
+
# Generate a unique marker for this table
|
408 |
+
marker = f"__TABLE_MARKER_{hash(table) % 10000}__"
|
409 |
+
# Find a good position to insert this table
|
410 |
+
# For now, just append all tables at the end
|
411 |
+
processed_text += f"\n\n{table}\n\n"
|
412 |
+
|
413 |
+
# Make sure paragraphs have proper spacing but not excessive
|
414 |
+
processed_text = re.sub(r'\n{3,}', '\n\n', processed_text)
|
415 |
+
|
416 |
+
# Ensure two newlines between paragraphs for proper markdown rendering
|
417 |
+
processed_text = re.sub(r'([^\n])\n([^\n])', r'\1\n\n\2', processed_text)
|
418 |
+
|
419 |
+
return processed_text
|
420 |
+
|
421 |
+
# Collect all available images from the result
|
422 |
+
available_images = []
|
423 |
+
if has_images and 'pages_data' in result:
|
424 |
+
for page_idx, page in enumerate(result['pages_data']):
|
425 |
+
if 'images' in page and len(page['images']) > 0:
|
426 |
+
for img_idx, img in enumerate(page['images']):
|
427 |
+
if 'image_base64' in img:
|
428 |
+
available_images.append({
|
429 |
+
'source': 'pages_data',
|
430 |
+
'page': page_idx,
|
431 |
+
'index': img_idx,
|
432 |
+
'data': img['image_base64']
|
433 |
+
})
|
434 |
+
|
435 |
+
# Get images from raw response as well
|
436 |
+
if 'raw_response_data' in result:
|
437 |
+
raw_data = result['raw_response_data']
|
438 |
+
if isinstance(raw_data, dict) and 'pages' in raw_data:
|
439 |
+
for page_idx, page in enumerate(raw_data['pages']):
|
440 |
+
if isinstance(page, dict) and 'images' in page:
|
441 |
+
for img_idx, img in enumerate(page['images']):
|
442 |
+
if isinstance(img, dict) and 'base64' in img:
|
443 |
+
available_images.append({
|
444 |
+
'source': 'raw_response',
|
445 |
+
'page': page_idx,
|
446 |
+
'index': img_idx,
|
447 |
+
'data': img['base64']
|
448 |
+
})
|
449 |
+
|
450 |
+
# Extract images for display at the top
|
451 |
+
images_to_display = []
|
452 |
+
|
453 |
+
# First, collect all available images
|
454 |
+
for img_idx, img in enumerate(available_images):
|
455 |
+
if 'data' in img:
|
456 |
+
images_to_display.append({
|
457 |
+
'data': img['data'],
|
458 |
+
'id': img.get('id', f"img_{img_idx}"),
|
459 |
+
'index': img_idx
|
460 |
+
})
|
461 |
+
|
462 |
+
# Display images at the top if available
|
463 |
+
if images_to_display:
|
464 |
+
st.markdown("### Document Images")
|
465 |
+
# Create columns for a grid layout (up to 2 columns to make images larger)
|
466 |
+
cols_count = min(2, len(images_to_display))
|
467 |
+
image_cols = st.columns(cols_count)
|
468 |
+
|
469 |
+
# Display each image in a column with minimal spacing
|
470 |
+
for i, img in enumerate(images_to_display):
|
471 |
+
with image_cols[i % cols_count]:
|
472 |
+
# Compact image display
|
473 |
+
st.image(img['data'], use_container_width=True)
|
474 |
+
st.markdown(f"<p style='margin-top:-5px; font-size:0.8rem; color:#666; text-align:center;'>Document Image {i+1}</p>", unsafe_allow_html=True)
|
475 |
+
|
476 |
+
# Organize sections in a logical order
|
477 |
+
section_order = ["title", "author", "date", "summary", "content", "transcript", "metadata"]
|
478 |
+
ordered_sections = []
|
479 |
+
|
480 |
+
# Add known sections first in preferred order
|
481 |
+
for section_name in section_order:
|
482 |
+
if section_name in result['ocr_contents'] and result['ocr_contents'][section_name]:
|
483 |
+
ordered_sections.append(section_name)
|
484 |
+
|
485 |
+
# Add any remaining sections
|
486 |
+
for section in result['ocr_contents'].keys():
|
487 |
+
if (section not in ordered_sections and
|
488 |
+
section not in ['error', 'partial_text'] and
|
489 |
+
result['ocr_contents'][section]):
|
490 |
+
ordered_sections.append(section)
|
491 |
+
|
492 |
+
# If only raw_text is available and no other content, add it last
|
493 |
+
if ('raw_text' in result['ocr_contents'] and
|
494 |
+
result['ocr_contents']['raw_text'] and
|
495 |
+
len(ordered_sections) == 0):
|
496 |
+
ordered_sections.append('raw_text')
|
497 |
+
|
498 |
+
# Add minimal spacing before OCR results
|
499 |
+
st.markdown("<div style='margin: 8px 0 4px 0;'></div>", unsafe_allow_html=True)
|
500 |
+
st.markdown("### Document Content")
|
501 |
+
|
502 |
+
# Process each section using expanders
|
503 |
+
for i, section in enumerate(ordered_sections):
|
504 |
+
content = result['ocr_contents'][section]
|
505 |
+
|
506 |
+
# Skip empty content
|
507 |
+
if not content:
|
508 |
+
continue
|
509 |
+
|
510 |
+
# Create an expander for each section
|
511 |
+
# First section is expanded by default
|
512 |
+
with st.expander(f"{section.replace('_', ' ').title()}", expanded=(i == 0)):
|
513 |
if isinstance(content, str):
|
514 |
+
# Handle image markdown
|
515 |
+
if content.startswith("![") and content.endswith(")"):
|
516 |
+
try:
|
517 |
+
alt_text = content[2:content.index(']')]
|
518 |
+
st.info(f"Image description: {alt_text if len(alt_text) > 5 else 'Image'}")
|
519 |
+
except:
|
520 |
+
st.info("Contains image reference")
|
521 |
+
else:
|
522 |
+
# Process text content
|
523 |
+
formatted_content = format_markdown_text(content).strip()
|
524 |
+
|
525 |
+
# Check if content contains markdown tables or complex text
|
526 |
+
has_tables = '|' in formatted_content and '---' in formatted_content
|
527 |
+
has_complex_structure = formatted_content.count('\n') > 5 or formatted_content.count('**') > 2
|
528 |
+
|
529 |
+
# Use a container with minimal margins
|
530 |
+
with st.container():
|
531 |
+
# For text-only extractions or content with tables, ensure proper rendering
|
532 |
+
if has_tables or has_complex_structure:
|
533 |
+
# For text with tables or multiple paragraphs, use special handling
|
534 |
+
# First ensure proper markdown spacing
|
535 |
+
formatted_content = formatted_content.replace('\n\n\n', '\n\n')
|
536 |
+
|
537 |
+
# Look for any all caps headers that might be misinterpreted
|
538 |
+
import re
|
539 |
+
formatted_content = re.sub(
|
540 |
+
r'^([A-Z][A-Z\s]+)$',
|
541 |
+
r'**\1**',
|
542 |
+
formatted_content,
|
543 |
+
flags=re.MULTILINE
|
544 |
+
)
|
545 |
+
|
546 |
+
# Preserve table formatting by adding proper spacing
|
547 |
+
if has_tables:
|
548 |
+
formatted_content = formatted_content.replace('\n|', '\n\n|')
|
549 |
+
|
550 |
+
# Add proper paragraph spacing
|
551 |
+
formatted_content = re.sub(r'([^\n])\n([^\n])', r'\1\n\n\2', formatted_content)
|
552 |
+
|
553 |
+
# Use standard markdown with custom styling
|
554 |
+
st.markdown(formatted_content, unsafe_allow_html=False)
|
555 |
+
else:
|
556 |
+
# For simpler content, use standard markdown
|
557 |
+
st.markdown(formatted_content)
|
558 |
+
|
559 |
elif isinstance(content, list):
|
560 |
+
# Create markdown list
|
561 |
+
list_items = []
|
562 |
for item in content:
|
563 |
if isinstance(item, str):
|
564 |
+
item_text = format_markdown_text(item).strip()
|
565 |
+
# Handle potential HTML special characters for proper rendering
|
566 |
+
item_text = item_text.replace('<', '<').replace('>', '>')
|
567 |
+
list_items.append(f"- {item_text}")
|
568 |
else:
|
569 |
+
list_items.append(f"- {str(item)}")
|
570 |
+
|
571 |
+
list_content = "\n".join(list_items)
|
572 |
+
|
573 |
+
# Use a container with minimal margins
|
574 |
+
with st.container():
|
575 |
+
# Use standard markdown for better rendering
|
576 |
+
st.markdown(list_content)
|
577 |
+
|
578 |
elif isinstance(content, dict):
|
579 |
+
# Format dictionary content
|
580 |
+
dict_items = []
|
581 |
for k, v in content.items():
|
582 |
+
key_formatted = k.replace('_', ' ').title()
|
583 |
+
|
584 |
+
if isinstance(v, str):
|
585 |
+
value_formatted = format_markdown_text(v).strip()
|
586 |
+
dict_items.append(f"**{key_formatted}:** {value_formatted}")
|
587 |
+
else:
|
588 |
+
dict_items.append(f"**{key_formatted}:** {str(v)}")
|
589 |
+
|
590 |
+
dict_content = "\n".join(dict_items)
|
591 |
+
|
592 |
+
# Use a container with minimal margins
|
593 |
+
with st.container():
|
594 |
+
# Use standard markdown for better rendering
|
595 |
+
st.markdown(dict_content)
|
596 |
|
597 |
+
# Display custom prompt if provided
|
598 |
+
if custom_prompt:
|
599 |
+
with st.expander("Custom Processing Instructions"):
|
600 |
+
st.write(custom_prompt)
|
601 |
+
|
602 |
+
# No download heading - start directly with buttons
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
603 |
|
604 |
+
# JSON download - use full width for buttons
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
605 |
try:
|
606 |
json_str = json.dumps(result, indent=2)
|
607 |
st.download_button(
|
|
|
612 |
)
|
613 |
except Exception as e:
|
614 |
st.error(f"Error creating JSON download: {str(e)}")
|
615 |
+
|
|
|
616 |
# Text download
|
617 |
try:
|
618 |
if 'ocr_contents' in result:
|
|
|
636 |
|
637 |
def display_document_with_images(result):
|
638 |
"""Display document with images"""
|
639 |
+
# Check for pages_data first
|
640 |
+
if 'pages_data' in result and result['pages_data']:
|
641 |
+
pages_data = result['pages_data']
|
642 |
+
# If pages_data not available, try to extract from raw_response_data
|
643 |
+
elif 'raw_response_data' in result and isinstance(result['raw_response_data'], dict) and 'pages' in result['raw_response_data']:
|
644 |
+
# Build pages_data from raw_response_data
|
645 |
+
pages_data = []
|
646 |
+
raw_pages = result['raw_response_data']['pages']
|
647 |
+
|
648 |
+
for page_idx, page in enumerate(raw_pages):
|
649 |
+
if not isinstance(page, dict):
|
650 |
+
continue
|
651 |
+
|
652 |
+
page_data = {
|
653 |
+
'page_number': page_idx + 1,
|
654 |
+
'markdown': page.get('markdown', ''),
|
655 |
+
'images': []
|
656 |
+
}
|
657 |
+
|
658 |
+
# Extract images if present
|
659 |
+
if 'images' in page and isinstance(page['images'], list):
|
660 |
+
for img_idx, img in enumerate(page['images']):
|
661 |
+
if isinstance(img, dict) and ('base64' in img or 'image_base64' in img):
|
662 |
+
img_base64 = img.get('image_base64', img.get('base64', ''))
|
663 |
+
if img_base64:
|
664 |
+
page_data['images'].append({
|
665 |
+
'id': img.get('id', f"img_{page_idx}_{img_idx}"),
|
666 |
+
'image_base64': img_base64
|
667 |
+
})
|
668 |
+
|
669 |
+
if page_data['markdown'] or page_data['images']:
|
670 |
+
pages_data.append(page_data)
|
671 |
+
else:
|
672 |
st.info("No image data available.")
|
673 |
return
|
674 |
|
675 |
# Display each page
|
676 |
+
for i, page_data in enumerate(pages_data):
|
677 |
st.markdown(f"### Page {i+1}")
|
678 |
|
679 |
# Create columns for image and text
|
680 |
img_col, text_col = st.columns([1, 1])
|
681 |
|
682 |
with img_col:
|
683 |
+
# Display the image - check multiple possible field names
|
684 |
+
image_displayed = False
|
685 |
+
|
686 |
+
# Try 'image_data' field first
|
687 |
if 'image_data' in page_data:
|
688 |
try:
|
689 |
# Convert base64 to image
|
690 |
image_data = base64.b64decode(page_data['image_data'])
|
691 |
st.image(io.BytesIO(image_data), use_container_width=True)
|
692 |
+
image_displayed = True
|
693 |
except Exception as e:
|
694 |
+
st.error(f"Error displaying image from image_data: {str(e)}")
|
695 |
+
|
696 |
+
# Try 'images' array if image_data didn't work
|
697 |
+
if not image_displayed and 'images' in page_data and len(page_data['images']) > 0:
|
698 |
+
for img in page_data['images']:
|
699 |
+
if 'image_base64' in img:
|
700 |
+
try:
|
701 |
+
st.image(img['image_base64'], use_container_width=True)
|
702 |
+
image_displayed = True
|
703 |
+
break
|
704 |
+
except Exception as e:
|
705 |
+
st.error(f"Error displaying image from images array: {str(e)}")
|
706 |
+
|
707 |
+
if not image_displayed:
|
708 |
st.info("No image available for this page.")
|
709 |
|
710 |
with text_col:
|
711 |
+
# Get text from various possible fields
|
712 |
+
page_text = ""
|
713 |
if 'text' in page_data:
|
714 |
+
page_text = page_data['text']
|
715 |
+
elif 'markdown' in page_data:
|
716 |
+
page_text = page_data['markdown']
|
717 |
+
|
718 |
+
# Special handling for image markdown in page data
|
719 |
+
if page_text.startswith("![") and page_text.endswith(")"):
|
720 |
+
# Try to display image if not already displayed
|
721 |
+
if not image_displayed and 'raw_response_data' in result:
|
722 |
+
raw_data = result['raw_response_data']
|
723 |
+
if isinstance(raw_data, dict) and 'pages' in raw_data:
|
724 |
+
for raw_page in raw_data['pages']:
|
725 |
+
if isinstance(raw_page, dict) and 'images' in raw_page:
|
726 |
+
for img in raw_page['images']:
|
727 |
+
if isinstance(img, dict) and 'base64' in img:
|
728 |
+
st.image(img['base64'])
|
729 |
+
st.caption("Image from OCR response")
|
730 |
+
image_displayed = True
|
731 |
+
break
|
732 |
+
if image_displayed:
|
733 |
+
break
|
734 |
+
|
735 |
+
# Try to extract alt text
|
736 |
+
try:
|
737 |
+
alt_text = page_text[2:page_text.index(']')]
|
738 |
+
if alt_text and len(alt_text) > 5: # Only show if alt text is meaningful
|
739 |
+
st.info(f"Image description: {alt_text}")
|
740 |
+
else:
|
741 |
+
st.info("This page contains an image with minimal text")
|
742 |
+
except:
|
743 |
+
st.info("This page contains an image with minimal text")
|
744 |
+
|
745 |
+
# Show warning if no image displayed
|
746 |
+
if not image_displayed:
|
747 |
+
st.warning("Image reference found in text, but no image data is available.")
|
748 |
+
|
749 |
+
# If no text found but we have raw_text in ocr_contents
|
750 |
+
if not page_text and 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
|
751 |
+
page_text = result['ocr_contents']['raw_text']
|
752 |
+
|
753 |
+
# Display the text with editing capability
|
754 |
+
if page_text:
|
755 |
+
edited_text = st.text_area(f"Page {i+1} Text", page_text, height=300, key=f"page_text_{i}")
|
756 |
|
757 |
+
# Add a simple button to copy the edited text to clipboard
|
758 |
+
st.button(f"Copy Text", key=f"copy_btn_{i}")
|
|
|
759 |
else:
|
760 |
st.info("No text available for this page.")
|
761 |
|
762 |
def display_previous_results():
|
763 |
+
"""Display previous results tab content in a simplified, structured view"""
|
|
|
764 |
|
765 |
+
# Use a clean header with the download button directly next to it
|
766 |
+
col1, col2 = st.columns([3, 1])
|
767 |
+
with col1:
|
768 |
+
st.header("Previous Results")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
769 |
|
770 |
# Display previous results if available
|
771 |
if not st.session_state.previous_results:
|
772 |
st.markdown("""
|
773 |
+
<div style="text-align: center; padding: 30px 20px; background-color: #f8f9fa; border-radius: 6px; margin-top: 10px;">
|
774 |
+
<div style="font-size: 36px; margin-bottom: 15px;">📄</div>
|
775 |
+
<h4 style="margin-bottom: 8px; font-weight: 500;">No Previous Results</h4>
|
776 |
+
<p style="font-size: 14px; color: #666;">Process a document to see your results history.</p>
|
777 |
</div>
|
778 |
""", unsafe_allow_html=True)
|
779 |
else:
|
780 |
+
# Add download button in the second column next to the header
|
781 |
+
with col2:
|
782 |
+
try:
|
783 |
+
# Create download button for all results
|
784 |
+
from ocr_utils import create_results_zip_in_memory
|
785 |
+
zip_data = create_results_zip_in_memory(st.session_state.previous_results)
|
786 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
787 |
|
788 |
+
# Simplified filename
|
789 |
+
zip_filename = f"ocr_results_{timestamp}.zip"
|
790 |
+
|
791 |
+
st.download_button(
|
792 |
+
label="Download All",
|
793 |
+
data=zip_data,
|
794 |
+
file_name=zip_filename,
|
795 |
+
mime="application/zip",
|
796 |
+
help="Download all results as ZIP"
|
797 |
+
)
|
798 |
+
except Exception:
|
799 |
+
# Silent fail - no error message to keep UI clean
|
800 |
+
pass
|
801 |
|
802 |
+
# Create a cleaner, more minimal grid for results using Streamlit columns
|
803 |
+
# Calculate number of columns based on screen width - more responsive
|
804 |
+
num_columns = 2 # Two columns for most screens
|
805 |
+
|
806 |
+
# Create rows of result cards
|
807 |
+
for i in range(0, len(st.session_state.previous_results), num_columns):
|
808 |
+
# Create a row of columns
|
809 |
+
cols = st.columns(num_columns)
|
810 |
+
|
811 |
+
# Fill each column with a result card
|
812 |
+
for j in range(num_columns):
|
813 |
+
index = i + j
|
814 |
+
if index < len(st.session_state.previous_results):
|
815 |
+
result = st.session_state.previous_results[index]
|
816 |
|
817 |
+
# Get basic info for the card
|
818 |
+
file_name = result.get("file_name", f"Document {index+1}")
|
819 |
+
timestamp = result.get("timestamp", "")
|
820 |
|
821 |
+
# Determine file type icon
|
822 |
+
if file_name.lower().endswith(".pdf"):
|
823 |
+
icon = "📄"
|
824 |
+
elif any(file_name.lower().endswith(ext) for ext in [".jpg", ".jpeg", ".png", ".gif"]):
|
825 |
+
icon = "🖼️"
|
|
|
|
|
826 |
else:
|
827 |
+
icon = "📝"
|
828 |
|
829 |
+
# Display a simplified card in each column
|
830 |
+
with cols[j]:
|
831 |
+
# Use a container for better styling control
|
832 |
+
with st.container():
|
833 |
+
# Create visually cleaner card with less vertical space
|
834 |
+
st.markdown(f"""
|
835 |
+
<div style="padding: 10px; border: 1px solid #e0e0e0; border-radius: 6px; margin-bottom: 10px;">
|
836 |
+
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 5px;">
|
837 |
+
<div style="font-weight: 500; font-size: 14px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap;">{icon} {file_name}</div>
|
838 |
+
<div style="color: #666; font-size: 12px;">{timestamp.split()[0] if timestamp else ""}</div>
|
839 |
+
</div>
|
840 |
+
</div>
|
841 |
+
""", unsafe_allow_html=True)
|
842 |
+
|
843 |
+
# Add a simple button below each card
|
844 |
+
if st.button(f"View", key=f"view_{index}", help=f"View {file_name}"):
|
845 |
+
st.session_state.selected_previous_result = st.session_state.previous_results[index]
|
846 |
+
st.rerun()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
847 |
|
848 |
# Display the selected result if available
|
849 |
if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
|
850 |
selected_result = st.session_state.selected_previous_result
|
851 |
|
852 |
+
# Draw a separator between results list and selected document
|
853 |
+
st.markdown("<hr style='margin: 20px 0 15px 0; border: none; height: 1px; background-color: #eee;'>", unsafe_allow_html=True)
|
854 |
+
|
855 |
+
# Create a cleaner header for the selected document
|
856 |
+
file_name = selected_result.get('file_name', 'Document')
|
857 |
+
st.subheader(f"{file_name}")
|
858 |
+
|
859 |
+
# Add a simple back button at the top
|
860 |
+
if st.button("← Back to Results", key="back_to_results"):
|
861 |
+
if 'selected_previous_result' in st.session_state:
|
862 |
+
del st.session_state.selected_previous_result
|
863 |
+
st.session_state.perform_reset = True
|
864 |
+
st.rerun()
|
865 |
+
|
866 |
+
# Simplified metadata display - just one line with essential info
|
867 |
+
meta_html = '<div style="display: flex; flex-wrap: wrap; gap: 12px; margin: 8px 0 15px 0; font-size: 14px; color: #666;">'
|
868 |
+
|
869 |
+
# Add timestamp
|
870 |
+
if 'timestamp' in selected_result:
|
871 |
+
meta_html += f'<div>{selected_result["timestamp"]}</div>'
|
872 |
|
873 |
+
# Add languages if available (simplified)
|
874 |
+
if 'languages' in selected_result and selected_result['languages']:
|
875 |
+
languages = [lang for lang in selected_result['languages'] if lang is not None]
|
876 |
+
if languages:
|
877 |
+
meta_html += f'<div>Language: {", ".join(languages)}</div>'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
878 |
|
879 |
+
# Add page count if available (simplified)
|
880 |
+
if 'limited_pages' in selected_result:
|
881 |
+
meta_html += f'<div>Pages: {selected_result["limited_pages"]["processed"]}/{selected_result["limited_pages"]["total"]}</div>'
|
|
|
882 |
|
883 |
+
meta_html += '</div>'
|
884 |
+
st.markdown(meta_html, unsafe_allow_html=True)
|
|
|
885 |
|
886 |
+
# Simplified tabs - fewer options for cleaner interface
|
887 |
has_images = selected_result.get('has_images', False)
|
888 |
if has_images:
|
889 |
+
view_tabs = st.tabs(["Document Content", "Raw Text", "Images"])
|
890 |
+
view_tab1, view_tab2, view_tab3 = view_tabs
|
891 |
else:
|
892 |
+
view_tabs = st.tabs(["Document Content", "Raw Text"])
|
893 |
+
view_tab1, view_tab2 = view_tabs
|
894 |
|
895 |
+
# Define helper function for formatting text
|
896 |
+
def format_text_display(text):
|
897 |
+
if not isinstance(text, str):
|
898 |
+
return text
|
899 |
+
|
900 |
+
lines = text.split('\n')
|
901 |
+
processed_lines = []
|
902 |
+
for line in lines:
|
903 |
+
line_stripped = line.strip()
|
904 |
+
if line_stripped and line_stripped.isupper() and len(line_stripped) > 3:
|
905 |
+
processed_lines.append(f"**{line_stripped}**")
|
906 |
+
else:
|
907 |
+
processed_lines.append(line)
|
908 |
+
|
909 |
+
return '\n'.join(processed_lines)
|
910 |
+
|
911 |
+
# First tab - Document Content (simplified structured view)
|
912 |
with view_tab1:
|
913 |
+
# Display content in a cleaner, more streamlined format
|
914 |
if 'ocr_contents' in selected_result and isinstance(selected_result['ocr_contents'], dict):
|
915 |
+
# Create a more focused list of important sections
|
916 |
+
priority_sections = ["title", "content", "transcript", "summary", "raw_text"]
|
917 |
+
displayed_sections = set()
|
918 |
+
|
919 |
+
# First display priority sections
|
920 |
+
for section in priority_sections:
|
921 |
+
if section in selected_result['ocr_contents'] and selected_result['ocr_contents'][section]:
|
922 |
+
content = selected_result['ocr_contents'][section]
|
923 |
+
if isinstance(content, str) and content.strip():
|
924 |
+
# Only add a subheader for meaningful section names, not raw_text
|
925 |
+
if section != "raw_text":
|
926 |
+
st.markdown(f"##### {section.replace('_', ' ').title()}")
|
927 |
+
|
928 |
+
# Format and display content
|
929 |
+
formatted_content = format_text_display(content)
|
930 |
+
st.markdown(formatted_content)
|
931 |
+
displayed_sections.add(section)
|
932 |
+
|
933 |
+
# Then display any remaining sections not already shown
|
934 |
for section, content in selected_result['ocr_contents'].items():
|
935 |
+
if (section not in displayed_sections and
|
936 |
+
section not in ['error', 'partial_text'] and
|
937 |
+
content):
|
938 |
+
st.markdown(f"##### {section.replace('_', ' ').title()}")
|
939 |
|
940 |
if isinstance(content, str):
|
941 |
+
st.markdown(format_text_display(content))
|
942 |
elif isinstance(content, list):
|
943 |
for item in content:
|
944 |
+
st.markdown(f"- {item}")
|
|
|
|
|
|
|
945 |
elif isinstance(content, dict):
|
946 |
for k, v in content.items():
|
947 |
+
st.markdown(f"**{k}:** {v}")
|
948 |
|
949 |
+
# Second tab - Raw Text (simplified)
|
950 |
with view_tab2:
|
951 |
+
# Extract raw text or content
|
952 |
raw_text = ""
|
953 |
if 'ocr_contents' in selected_result:
|
954 |
if 'raw_text' in selected_result['ocr_contents']:
|
|
|
956 |
elif 'content' in selected_result['ocr_contents']:
|
957 |
raw_text = selected_result['ocr_contents']['content']
|
958 |
|
959 |
+
# Display the text area with raw text
|
960 |
+
edited_text = st.text_area("", raw_text, height=300, key="selected_raw_text")
|
|
|
|
|
|
|
|
|
961 |
|
962 |
+
# Add buttons in a row
|
963 |
+
col1, col2 = st.columns(2)
|
964 |
+
with col1:
|
965 |
+
st.button("Copy Text", key="selected_copy_btn")
|
966 |
+
with col2:
|
967 |
+
st.download_button(
|
968 |
+
label="Download Text",
|
969 |
+
data=edited_text,
|
970 |
+
file_name=f"{file_name.split('.')[0]}_text.txt",
|
971 |
+
mime="text/plain",
|
972 |
+
key="selected_download_btn"
|
973 |
+
)
|
974 |
|
975 |
+
# Third tab - With Images (simplified)
|
976 |
if has_images and 'pages_data' in selected_result:
|
977 |
with view_tab3:
|
978 |
+
# Simplified image display
|
979 |
+
if 'pages_data' in selected_result:
|
980 |
+
for i, page_data in enumerate(selected_result['pages_data']):
|
981 |
+
# Display each page
|
982 |
+
if 'images' in page_data and len(page_data['images']) > 0:
|
983 |
+
for img in page_data['images']:
|
984 |
+
if 'image_base64' in img:
|
985 |
+
st.image(img['image_base64'], use_column_width=True)
|
986 |
+
|
987 |
+
# Get page text if available
|
988 |
+
page_text = ""
|
989 |
+
if 'markdown' in page_data:
|
990 |
+
page_text = page_data['markdown']
|
991 |
+
|
992 |
+
# Display text if available
|
993 |
+
if page_text:
|
994 |
+
with st.expander(f"Page {i+1} Text", expanded=False):
|
995 |
+
st.text(page_text)
|
996 |
|
997 |
def display_about_tab():
|
998 |
"""Display about tab content"""
|
999 |
+
st.header("About")
|
1000 |
|
1001 |
# Add app description
|
1002 |
st.markdown("""
|
1003 |
**Historical OCR** is a specialized tool for extracting text from historical documents, manuscripts, and printed materials.
|
1004 |
+
""")
|
1005 |
|
1006 |
+
# Purpose section with consistent formatting
|
1007 |
+
st.markdown("### Purpose")
|
1008 |
+
st.markdown("""
|
1009 |
This tool is designed to assist scholars in historical research by extracting text from challenging documents.
|
1010 |
While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
|
1011 |
historical documents, particularly:
|
1012 |
+
""")
|
1013 |
|
1014 |
+
st.markdown("""
|
1015 |
- **Historical newspapers** with complex layouts and aged text
|
1016 |
- **Handwritten documents** from various time periods
|
1017 |
- **Photos of archival materials** that may be difficult to read
|
1018 |
+
""")
|
1019 |
|
1020 |
+
# Features section with consistent formatting
|
1021 |
+
st.markdown("### Features")
|
1022 |
+
st.markdown("""
|
1023 |
- **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
|
1024 |
- **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
|
1025 |
- **Editable Results**: Review and edit extracted text directly in the interface
|
1026 |
- **Structured Content Analysis**: Automatic organization of document content
|
1027 |
- **Multi-language Support**: Process documents in various languages
|
1028 |
- **PDF Processing**: Handle multi-page historical documents
|
1029 |
+
""")
|
1030 |
|
1031 |
+
# How to Use section with consistent formatting
|
1032 |
+
st.markdown("### How to Use")
|
1033 |
+
st.markdown("""
|
1034 |
1. Upload a document (PDF or image)
|
1035 |
2. Select the document type and adjust preprocessing options if needed
|
1036 |
3. Add custom processing instructions for specialized documents
|
1037 |
4. Process the document
|
1038 |
5. Review, edit, and download the results
|
1039 |
+
""")
|
1040 |
|
1041 |
+
# Technologies section with consistent formatting
|
1042 |
+
st.markdown("### Technologies")
|
1043 |
+
st.markdown("""
|
1044 |
- OCR processing using Mistral AI's advanced document understanding capabilities
|
1045 |
- Image preprocessing with OpenCV
|
1046 |
- PDF handling with pdf2image
|
utils.py
CHANGED
@@ -13,12 +13,76 @@ logger = logging.getLogger("utils")
|
|
13 |
logger.setLevel(logging.INFO)
|
14 |
|
15 |
def get_base64_from_image(image_path):
|
16 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
try:
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
except Exception as e:
|
21 |
-
logger.error(f"Error encoding
|
22 |
return ""
|
23 |
|
24 |
def timing(description):
|
|
|
13 |
logger.setLevel(logging.INFO)
|
14 |
|
15 |
def get_base64_from_image(image_path):
    """
    Get base64 data URL from image file with proper MIME type.

    Args:
        image_path: Path to the image file

    Returns:
        Base64 data URL with appropriate MIME type prefix, or an empty
        string if the file cannot be read/encoded (the error is logged).
    """
    import mimetypes  # local import: keeps the file's top-level imports untouched

    # Explicit map for the formats this app handles most often.
    known_types = {
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.pdf': 'application/pdf',
    }
    try:
        # Convert to Path object for better handling
        path_obj = Path(image_path)
        suffix = path_obj.suffix.lower()

        mime_type = known_types.get(suffix)
        if mime_type is None:
            # Previously every unknown extension was mislabeled as image/jpeg
            # (e.g. .webp, .bmp, .tiff). Ask the stdlib registry first and keep
            # the original 'image/jpeg' default only when it cannot guess.
            guessed, _ = mimetypes.guess_type(path_obj.name)
            mime_type = guessed or 'image/jpeg'

        # Read and encode file
        with open(path_obj, "rb") as file:
            encoded = base64.b64encode(file.read()).decode('utf-8')
        return f"data:{mime_type};base64,{encoded}"
    except Exception as e:
        # Best-effort helper: callers treat "" as "no image available".
        logger.error(f"Error encoding file to base64: {str(e)}")
        return ""
|
48 |
+
|
49 |
+
def get_base64_from_bytes(file_bytes, mime_type=None, file_name=None):
    """
    Get base64 data URL from file bytes with proper MIME type.

    Args:
        file_bytes: Binary file data
        mime_type: MIME type of the file (optional)
        file_name: Original file name for MIME type detection (optional)

    Returns:
        Base64 data URL with appropriate MIME type prefix
    """
    # Extension -> MIME lookup, consulted only when the caller did not
    # provide an explicit mime_type.
    suffix_to_mime = {
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.pdf': 'application/pdf',
    }
    try:
        if mime_type is None:
            if file_name is not None:
                ext = Path(file_name).suffix.lower()
                # Unknown extensions fall back to the generic binary type.
                mime_type = suffix_to_mime.get(ext, 'application/octet-stream')
            else:
                # No hint at all: assume generic binary data.
                mime_type = 'application/octet-stream'

        payload = base64.b64encode(file_bytes).decode('utf-8')
        return f"data:{mime_type};base64,{payload}"
    except Exception as e:
        logger.error(f"Error encoding bytes to base64: {str(e)}")
        return ""
|
87 |
|
88 |
def timing(description):
|