milwright committed on
Commit
7647e70
·
1 Parent(s): 8d3bfba

Reconcile main with preview-improvements branch, implementing modular structure, raw text editing, and enhanced framing

Browse files
Files changed (9) hide show
  1. app.py +0 -0
  2. constants.py +110 -0
  3. error_handler.py +65 -0
  4. ocr_processing.py +279 -0
  5. preprocessing.py +180 -0
  6. ui/custom.css +222 -335
  7. ui/layout.py +210 -20
  8. ui_components.py +774 -0
  9. utils.py +263 -0
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
constants.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Constants for the Historical OCR application.

Shared limits, option lists and prompt fragments are collected in this
module so they can be maintained and updated in a single place.
"""

# ---------------------------------------------------------------------------
# API limits
# ---------------------------------------------------------------------------
MAX_FILE_SIZE_MB = 50
MAX_PAGES = 20

# ---------------------------------------------------------------------------
# Caching
# ---------------------------------------------------------------------------
CACHE_TTL_SECONDS = 24 * 60 * 60  # 24 hours
MAX_CACHE_ENTRIES = 20

# ---------------------------------------------------------------------------
# Image processing
# ---------------------------------------------------------------------------
MAX_IMAGE_DIMENSION = 2500
IMAGE_QUALITY = 92

# ---------------------------------------------------------------------------
# Custom prompt templates, keyed by document type
# ---------------------------------------------------------------------------
CUSTOM_PROMPT_TEMPLATES = {
    "Newspaper or Magazine": "This is a newspaper/magazine. Process columns from top to bottom, capture headlines, bylines, article text and captions.",
    "Letter or Correspondence": "This is a letter/correspondence. Capture letterhead, date, greeting, body, closing and signature. Note any handwritten annotations.",
    "Book or Publication": "This is a book/publication. Extract titles, headers, footnotes, page numbers and body text. Preserve paragraph structure and any special formatting.",
    "Form or Legal Document": "This is a form/legal document. Extract all field labels and values, preserving the structure. Pay special attention to signature lines, dates, and any official markings.",
    "Recipe": "This is a recipe. Extract title, ingredients list with measurements, and preparation instructions. Maintain the distinction between ingredients and preparation steps.",
    "Handwritten Document": "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations.",
    "Map or Illustration": "This is a map or illustration. Transcribe all labels, legends, captions, and annotations. Note any scale indicators or directional markings.",
    "Table or Spreadsheet": "This is a table/spreadsheet. Preserve row and column structure, maintaining alignment of data. Extract headers and all cell values.",
    "Other (specify in instructions)": "Please describe the document type and any special processing requirements here.",
}

# ---------------------------------------------------------------------------
# Layout prompt additions, keyed by document layout
# ---------------------------------------------------------------------------
LAYOUT_PROMPT_ADDITIONS = {
    "Multiple columns": "Document has multiple columns. Read each column from top to bottom, then move to the next column.",
    "Table/grid format": "Document contains table data. Preserve row and column structure during extraction.",
    "Mixed layout with images": "Document has mixed text layout with images. Extract text in proper reading order.",
}

# ---------------------------------------------------------------------------
# Document types: the auto-detect entry followed by every type that has a
# prompt template (dict insertion order gives the display order).
# ---------------------------------------------------------------------------
DOCUMENT_TYPES = ["Auto-detect (standard processing)", *CUSTOM_PROMPT_TEMPLATES]

# ---------------------------------------------------------------------------
# Document layouts: the default layout followed by every layout that has a
# prompt addition.
# ---------------------------------------------------------------------------
DOCUMENT_LAYOUTS = ["Standard layout", *LAYOUT_PROMPT_ADDITIONS]

# ---------------------------------------------------------------------------
# Preprocessing document types
# ---------------------------------------------------------------------------
PREPROCESSING_DOC_TYPES = "standard handwritten typed printed".split()

# ---------------------------------------------------------------------------
# Rotation options (degrees, quarter turns)
# ---------------------------------------------------------------------------
ROTATION_OPTIONS = list(range(0, 360, 90))

# ---------------------------------------------------------------------------
# PDF settings
# ---------------------------------------------------------------------------
DEFAULT_PDF_DPI = 100
MIN_PDF_DPI = 72
MAX_PDF_DPI = 300
DEFAULT_MAX_PAGES = 3

# ---------------------------------------------------------------------------
# Performance modes
# ---------------------------------------------------------------------------
PERFORMANCE_MODES = ["Quality", "Speed"]

# ---------------------------------------------------------------------------
# Content themes for subject tag extraction (theme -> trigger keywords)
# ---------------------------------------------------------------------------
CONTENT_THEMES = {
    "Historical": [
        "century", "ancient", "historical", "history", "vintage", "archive", "heritage",
    ],
    "Travel": [
        "travel", "journey", "expedition", "exploration", "voyage", "map", "location",
    ],
    "Science": [
        "experiment", "research", "study", "analysis", "scientific", "laboratory",
    ],
    "Literature": [
        "book", "novel", "poetry", "author", "literary", "chapter", "story",
    ],
    "Art": [
        "painting", "illustration", "drawing", "artist", "exhibit", "gallery", "portrait",
    ],
    "Education": [
        "education", "school", "university", "college", "learning", "student", "teach",
    ],
    "Politics": [
        "government", "political", "policy", "administration", "election", "legislature",
    ],
    "Business": [
        "business", "company", "corporation", "market", "industry", "commercial", "trade",
    ],
    "Social": [
        "society", "community", "social", "culture", "tradition", "customs",
    ],
    "Technology": [
        "technology", "invention", "device", "mechanical", "machine", "technical",
    ],
    "Military": [
        "military", "army", "navy", "war", "battle", "soldier", "weapon",
    ],
    "Religion": [
        "religion", "church", "temple", "spiritual", "sacred", "ritual",
    ],
    "Medicine": [
        "medical", "medicine", "health", "hospital", "treatment", "disease", "doctor",
    ],
    "Legal": [
        "legal", "law", "court", "justice", "attorney", "judicial", "statute",
    ],
    "Correspondence": [
        "letter", "mail", "correspondence", "message", "communication",
    ],
}

# ---------------------------------------------------------------------------
# Period tags based on inclusive (first_year, last_year) ranges
# ---------------------------------------------------------------------------
PERIOD_TAGS = {
    (0, 1799): "Pre-1800s",
    (1800, 1849): "Early 19th Century",
    (1850, 1899): "Late 19th Century",
    (1900, 1949): "Early 20th Century",
    (1950, 2099): "Modern Era",
}

# ---------------------------------------------------------------------------
# Default fallback tags
# ---------------------------------------------------------------------------
DEFAULT_TAGS = ["Document", "Historical", "Text"]
GENERIC_TAGS = ["Archive", "Content", "Record"]

# ---------------------------------------------------------------------------
# UI constants
# ---------------------------------------------------------------------------
PROGRESS_DELAY = 0.8  # Seconds to show completion message
error_handler.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import streamlit as st
3
+ import time
4
+ from constants import MAX_FILE_SIZE_MB
5
+
6
+ # Configure logging
7
+ logger = logging.getLogger("error_handler")
8
+ logger.setLevel(logging.INFO)
9
+
10
def handle_ocr_error(exception, progress_reporter=None):
    """
    Translate an OCR processing exception into a user-facing message.

    The exception text is matched against a few known failure categories
    (rate limiting, quota, timeout, file size). Each category is logged and
    mapped to a fixed friendly message; anything else is logged with the
    active traceback and echoed back in a generic message.

    Args:
        exception: The exception that occurred
        progress_reporter: Optional ProgressReporter; when truthy it is
            marked as completed unsuccessfully before classification

    Returns:
        str: User-friendly error message
    """
    error_message = str(exception)
    lowered = error_message.lower()

    # Close out any progress UI first so it never stays stuck mid-way.
    if progress_reporter:
        progress_reporter.complete(success=False)

    def mentions(*needles):
        """Return True when any needle occurs in the lower-cased message."""
        return any(needle in lowered for needle in needles)

    # Categories are checked in priority order; the first match wins.
    if "429" in error_message or mentions("rate limit", "requests rate limit exceeded"):
        log_label = "Rate limit error"
        friendly_message = "The AI service is currently experiencing high demand. Please try again in a few minutes."
    elif mentions("quota", "credit", "subscription"):
        log_label = "API quota error"
        friendly_message = "The API usage quota has been reached. Please check your API key and subscription limits."
    elif mentions("timeout", "timed out"):
        log_label = "Timeout error"
        friendly_message = "The request timed out. This may be due to a large document or high server load. Please try again or use a smaller document."
    elif mentions("file size", "too large"):
        log_label = "File size error"
        friendly_message = f"The file is too large. Maximum file size is {MAX_FILE_SIZE_MB}MB."
    else:
        # Unknown failure: keep the traceback in the log for diagnosis.
        logger.error(f"OCR processing error: {error_message}", exc_info=True)
        return f"An error occurred during processing: {error_message}"

    logger.error(f"{log_label}: {error_message}")
    return friendly_message
48
+
49
def check_file_size(file_bytes):
    """
    Validate an upload's size against MAX_FILE_SIZE_MB.

    Args:
        file_bytes: File content as bytes

    Returns:
        tuple: (is_valid, file_size_mb, error_message) where error_message
        is None when the file is within the limit
    """
    size_in_mb = len(file_bytes) / (1024 * 1024)

    # Guard clause: anything at or under the limit is accepted as-is.
    if size_in_mb <= MAX_FILE_SIZE_MB:
        return True, size_in_mb, None

    return (
        False,
        size_in_mb,
        f"File size {size_in_mb:.2f} MB exceeds limit of {MAX_FILE_SIZE_MB} MB",
    )
ocr_processing.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import hashlib
3
+ import tempfile
4
+ import streamlit as st
5
+ import logging
6
+ import time
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ from structured_ocr import StructuredOCR
10
+ from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
11
+ from preprocessing import apply_preprocessing_to_file
12
+ from error_handler import handle_ocr_error, check_file_size
13
+
14
+ # Configure logging
15
+ logger = logging.getLogger("ocr_processing")
16
+ logger.setLevel(logging.INFO)
17
+
18
@st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
def _process_file_cached(_file_path, file_type, use_vision, file_size_mb, cache_key,
                         preprocessing_options_hash=None, custom_prompt=None, pdf_rotation=0):
    """
    Streamlit-cached OCR worker behind process_file_cached.

    ``_file_path`` is underscore-prefixed so Streamlit leaves it out of the
    cache hash: callers pass a freshly created temporary file whose name
    differs on every run, which would otherwise make every call a cache
    miss. The remaining arguments (notably ``cache_key``) identify the
    request.
    """
    processor = StructuredOCR()

    ocr_kwargs = {
        "file_type": file_type,
        "use_vision": use_vision,
        "custom_prompt": custom_prompt,
        "file_size_mb": file_size_mb,
    }
    # Rotation is only forwarded for PDFs, mirroring the direct (uncached) call.
    if file_type == "pdf":
        ocr_kwargs["pdf_rotation"] = pdf_rotation

    with timing(f"OCR processing of {file_type} file"):
        return processor.process_file(_file_path, **ocr_kwargs)


def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None,
                        custom_prompt=None, pdf_rotation=0):
    """
    Cached version of OCR processing to reuse results

    Args:
        file_path: Path to the file to process (not part of the cache key)
        file_type: Type of file (pdf or image)
        use_vision: Whether to use vision model
        file_size_mb: File size in MB
        cache_key: Cache key for the file; expected to identify the file
            content and options (it is built by generate_cache_key)
        preprocessing_options_hash: Hash of preprocessing options
        custom_prompt: Custom prompt forwarded to the OCR processor
        pdf_rotation: Rotation forwarded to the OCR processor for PDFs

    Returns:
        dict: OCR result
    """
    return _process_file_cached(
        file_path,
        file_type,
        use_vision,
        file_size_mb,
        cache_key,
        preprocessing_options_hash,
        custom_prompt=custom_prompt,
        pdf_rotation=pdf_rotation,
    )


# Keep the Streamlit cache-control handle reachable under the public name.
process_file_cached.clear = _process_file_cached.clear


def _error_result(file_name, error_message, ocr_error, partial_text):
    """Build the placeholder result dict returned when processing fails."""
    return {
        "file_name": file_name,
        "topics": ["Document"],
        "languages": ["English"],
        "error": error_message,
        "ocr_contents": {
            "error": ocr_error,
            "partial_text": partial_text
        }
    }


def _write_tracked_temp_file(content, suffix, temp_file_paths):
    """Write content to a new temp file, registering it for cleanup first."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        # Register before writing so a failed write still gets cleaned up.
        temp_file_paths.append(tmp.name)
        tmp.write(content)
        return tmp.name


def _run_ocr_with_fallback(temp_path, file_type, use_vision, file_size_mb, cache_key,
                           preprocessing_options, custom_prompt, pdf_rotation, progress_reporter):
    """Run OCR through the cache, retrying once uncached if that raises."""
    try:
        return process_file_cached(
            temp_path,
            file_type,
            use_vision,
            file_size_mb,
            cache_key,
            str(preprocessing_options),
            custom_prompt=custom_prompt,
            pdf_rotation=pdf_rotation,
        )
    except Exception as e:
        logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
        progress_reporter.update(60, f"Processing error: {str(e)}. Retrying...")

        direct_kwargs = {
            "file_path": temp_path,
            "file_type": file_type,
            "use_vision": use_vision,
            "custom_prompt": custom_prompt,
            "file_size_mb": file_size_mb,
        }
        if file_type == "pdf":
            direct_kwargs["pdf_rotation"] = pdf_rotation
        return StructuredOCR().process_file(**direct_kwargs)


def process_file(uploaded_file, use_vision=True, preprocessing_options=None, progress_reporter=None,
                 pdf_dpi=150, max_pages=3, pdf_rotation=0, custom_prompt=None, perf_mode="Quality"):
    """
    Process the uploaded file and return the OCR results

    Errors never propagate: failures are converted to a result dict with
    an "error" entry, and temporary files are removed in all cases.

    Args:
        uploaded_file: The uploaded file to process (needs .name and .getvalue())
        use_vision: Whether to use vision model
        preprocessing_options: Dictionary of preprocessing options
        progress_reporter: ProgressReporter instance for UI updates
        pdf_dpi: DPI for PDF conversion
        max_pages: Maximum number of pages to process
        pdf_rotation: PDF rotation value
        custom_prompt: Custom prompt for OCR
        perf_mode: Performance mode (Quality or Speed)

    Returns:
        dict: OCR result

    NOTE(review): pdf_dpi, max_pages and perf_mode are accepted for interface
    compatibility, but nothing in this function forwards them to the OCR
    processor, so they currently have no effect here.
    """
    if preprocessing_options is None:
        preprocessing_options = {}

    # Create a container for progress indicators if not provided
    if progress_reporter is None:
        from ui_components import ProgressReporter
        progress_reporter = ProgressReporter(st.empty()).setup()

    # Every temp file created below is registered here and removed in `finally`.
    temp_file_paths = []

    try:
        # Read the upload once and reuse the bytes for every later step.
        file_bytes = uploaded_file.getvalue()

        is_valid, file_size_mb, error_message = check_file_size(file_bytes)
        if not is_valid:
            progress_reporter.complete(success=False)
            st.error(error_message)
            return _error_result(
                uploaded_file.name,
                error_message,
                error_message,
                "Document could not be processed due to size limitations.",
            )

        progress_reporter.update(10, "Initializing OCR processor...")

        # Determine file type from extension
        file_ext = Path(uploaded_file.name).suffix.lower()
        file_type = "pdf" if file_ext == ".pdf" else "image"

        if file_type == "pdf":
            progress_reporter.update(20, "Converting PDF to images...")
            progress_reporter.update(30, "Processing PDF with OCR...")

            temp_path = _write_tracked_temp_file(file_bytes, file_ext, temp_file_paths)
            key_bytes = file_bytes
            key_rotation = pdf_rotation
        else:
            progress_reporter.update(20, "Preparing image for processing...")

            temp_path, preprocessing_applied = apply_preprocessing_to_file(
                file_bytes,
                file_ext,
                preprocessing_options,
                temp_file_paths
            )
            if preprocessing_applied:
                progress_reporter.update(30, "Applied image preprocessing...")

            # Key on the bytes that will actually be sent to OCR.
            with open(temp_path, 'rb') as processed_file:
                key_bytes = processed_file.read()
            key_rotation = 0  # No rotation for images (handled in preprocessing)

            progress_reporter.update(50, "Processing document with OCR...")

        cache_key = generate_cache_key(
            key_bytes,
            file_type,
            use_vision,
            preprocessing_options,
            key_rotation,
            custom_prompt
        )

        result = _run_ocr_with_fallback(
            temp_path,
            file_type,
            use_vision,
            file_size_mb,
            cache_key,
            preprocessing_options,
            custom_prompt,
            pdf_rotation,
            progress_reporter,
        )

        if file_type == "image":
            progress_reporter.update(80, "Analyzing document structure...")
        progress_reporter.update(90, "Finalizing results...")

        # Add additional metadata to result
        result = process_result(result, uploaded_file, preprocessing_options)

        progress_reporter.complete()
        return result
    except Exception as e:
        # Convert the failure into a friendly message and an error result.
        error_message = handle_ocr_error(e, progress_reporter)
        return _error_result(
            uploaded_file.name,
            error_message,
            f"Failed to process file: {error_message}",
            "Document could not be processed due to an error.",
        )
    finally:
        # Best-effort cleanup: a file that cannot be removed is only logged.
        for leftover_path in temp_file_paths:
            try:
                if os.path.exists(leftover_path):
                    os.unlink(leftover_path)
                    logger.info(f"Removed temporary file: {leftover_path}")
            except OSError as e:
                logger.warning(f"Failed to remove temporary file {leftover_path}: {str(e)}")
238
+
239
def process_result(result, uploaded_file, preprocessing_options=None):
    """
    Enrich an OCR result in place with metadata and subject tags.

    Adds a timestamp, a default processing time, a descriptive file name
    and, when the result has no topics yet, extracted subject tags.

    Args:
        result: OCR result dictionary (mutated and returned)
        uploaded_file: The uploaded file; only its .name is read here
        preprocessing_options: Dictionary of preprocessing options

    Returns:
        dict: The same result dictionary with the extra fields filled in
    """
    result['timestamp'] = format_timestamp()

    # Only supply a processing time when the OCR step did not record one.
    result.setdefault('processing_time', 0.0)

    extension = Path(uploaded_file.name).suffix.lower()
    result['descriptive_file_name'] = create_descriptive_filename(
        uploaded_file.name, result, extension, preprocessing_options
    )

    # Prefer 'raw_text', then 'content', as the text used for tagging.
    raw_text = ""
    if 'ocr_contents' in result:
        contents = result['ocr_contents']
        for text_key in ('raw_text', 'content'):
            if text_key in contents:
                raw_text = contents[text_key]
                break

    # Existing non-empty topics are kept untouched.
    if not result.get('topics'):
        result['topics'] = extract_subject_tags(result, raw_text, preprocessing_options)

    return result
preprocessing.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import cv2
4
+ import numpy as np
5
+ import tempfile
6
+ from PIL import Image, ImageEnhance, ImageFilter
7
+ from pdf2image import convert_from_bytes
8
+ import streamlit as st
9
+ import logging
10
+
11
+ # Configure logging
12
+ logger = logging.getLogger("preprocessing")
13
+ logger.setLevel(logging.INFO)
14
+
15
@st.cache_data(ttl=24*3600, show_spinner=False)  # Cache for 24 hours
def convert_pdf_to_images(pdf_bytes, dpi=150, rotation=0):
    """
    Render PDF bytes into a list of page images (cached by Streamlit).

    Pages are rendered at ``dpi`` and, when ``rotation`` is non-zero, each
    page is rotated by that many degrees with the canvas expanded to fit.
    Any failure is reported via st.error, logged, and yields an empty list.

    NOTE(review): because the function is cached, the empty list returned
    on failure is presumably cached as well - confirm this is intended.
    """
    try:
        pages = convert_from_bytes(pdf_bytes, dpi=dpi)

        # Nothing to rotate: hand the rendered pages back unchanged.
        if rotation == 0 or not pages:
            return pages

        return [
            page.rotate(rotation, expand=True, resample=Image.BICUBIC)
            for page in pages
        ]
    except Exception as e:
        st.error(f"Error converting PDF: {str(e)}")
        logger.error(f"PDF conversion error: {str(e)}")
        return []
34
+
35
@st.cache_data(ttl=24*3600, show_spinner=False, hash_funcs={dict: lambda x: str(sorted(x.items()))})
def preprocess_image(image_bytes, preprocessing_options):
    """
    Preprocess an image for OCR and return it re-encoded as JPEG bytes.

    Steps, in order: normalise the colour mode (transparency is flattened
    onto white), rotate, downscale so the longest side is at most 2500 px,
    then apply the optional grayscale, contrast and denoise steps.

    Args:
        image_bytes: Encoded source image as bytes
        preprocessing_options: Dict of options read with .get():
            "rotation" (degrees, default 0), "document_type" (default
            "standard"; "handwritten" selects gentler variants),
            "grayscale" (bool), "contrast" (percent offset, default 0)
            and "denoise" (bool)

    Returns:
        bytes: JPEG-encoded processed image. If encoding the processed
        image fails, the pre-denoise image is encoded instead.
    """
    # Setup basic console logging
    logger = logging.getLogger("image_preprocessor")
    logger.setLevel(logging.INFO)

    logger.info(f"Preprocessing image with options: {preprocessing_options}")

    image = Image.open(io.BytesIO(image_bytes))
    # Remember the source mode now; `image` is reassigned by later steps.
    original_mode = image.mode

    # Flatten any transparency onto white so transparent regions do not turn
    # dark once the alpha channel is dropped for JPEG output.
    has_alpha = original_mode in ('RGBA', 'LA', 'PA') or (
        original_mode == 'P' and 'transparency' in image.info
    )
    if has_alpha:
        rgba_image = image if original_mode == 'RGBA' else image.convert('RGBA')
        background = Image.new('RGB', rgba_image.size, (255, 255, 255))
        background.paste(rgba_image, mask=rgba_image.split()[3])  # 3 is the alpha channel
        image = background
        logger.info(f"Converted {original_mode} image to RGB")
    elif original_mode not in ('RGB', 'L'):
        image = image.convert('RGB')
        logger.info(f"Converted {original_mode} image to RGB")

    # Apply rotation if specified
    rotation_degrees = preprocessing_options.get("rotation", 0)
    if rotation_degrees != 0:
        image = image.rotate(rotation_degrees, expand=True, resample=Image.BICUBIC)

    # Resize large images while preserving details important for OCR
    width, height = image.size
    max_dimension = max(width, height)
    if max_dimension > 2500:
        scale_factor = 2500 / max_dimension
        # Clamp to 1 px so extreme aspect ratios cannot round down to zero.
        new_width = max(1, int(width * scale_factor))
        new_height = max(1, int(height * scale_factor))
        # Use LANCZOS for better quality preservation
        image = image.resize((new_width, new_height), Image.LANCZOS)

    img_array = np.array(image)

    document_type = preprocessing_options.get("document_type", "standard")

    # Grayscale first, as it is a common foundation for the later steps
    if preprocessing_options.get("grayscale", False):
        if len(img_array.shape) == 3:  # Only convert if it's not already grayscale
            img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
            if document_type == "handwritten":
                # Adaptive histogram equalization to enhance handwriting
                clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
                img_array = clahe.apply(img_array)

            # Convert back to RGB for further processing
            img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)

    contrast_offset = preprocessing_options.get("contrast", 0)
    if contrast_offset != 0:
        contrast_factor = 1 + (contrast_offset / 100)
        image = Image.fromarray(img_array)
        image = ImageEnhance.Contrast(image).enhance(contrast_factor)
        img_array = np.array(image)

    if preprocessing_options.get("denoise", False):
        try:
            is_color = len(img_array.shape) == 3 and img_array.shape[2] == 3
            if document_type == "handwritten":
                # Very light denoising to preserve pen strokes
                if is_color:
                    img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 3, 3, 5, 9)
                else:
                    img_array = cv2.fastNlMeansDenoising(img_array, None, 3, 7, 21)
            else:
                # Standard denoising for printed documents
                if is_color:
                    img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 5, 5, 7, 21)
                else:
                    img_array = cv2.fastNlMeansDenoising(img_array, None, 5, 7, 21)
        except Exception as e:
            # Denoising is optional: continue with the un-denoised array.
            logger.error(f"Denoising error: {str(e)}, falling back to standard processing")

    # Convert back to PIL Image
    processed_image = Image.fromarray(img_array)

    byte_io = io.BytesIO()
    try:
        # JPEG only supports RGB / L, so normalise anything else first
        if processed_image.mode not in ('RGB', 'L'):
            processed_image = processed_image.convert('RGB')

        processed_image.save(byte_io, format='JPEG', quality=92, optimize=True)
        processed_bytes = byte_io.getvalue()

        logger.info(f"Preprocessing complete. Original image mode: {original_mode}, processed mode: {processed_image.mode}")
        logger.info(f"Original size: {len(image_bytes)/1024:.1f}KB, processed size: {len(processed_bytes)/1024:.1f}KB")

        return processed_bytes
    except Exception as e:
        logger.error(f"Error saving processed image: {str(e)}")
        # Fall back to the last PIL image held before array post-processing
        logger.info("Using original image as fallback")
        image_io = io.BytesIO()
        image.save(image_io, format='JPEG', quality=92)
        return image_io.getvalue()
148
+
149
def create_temp_file(content, suffix, temp_file_paths):
    """
    Create a temporary file holding ``content`` and track it for cleanup.

    Args:
        content: Bytes to write into the file
        suffix: File name suffix (extension) for the temporary file
        temp_file_paths: List the new path is appended to; the caller is
            responsible for deleting every path in it

    Returns:
        str: Path of the created temporary file
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        temp_path = tmp.name
        # Track the path before writing: the file already exists on disk, so
        # a failed write must not leave it behind untracked.
        temp_file_paths.append(temp_path)
        tmp.write(content)
    logger.info(f"Created temporary file: {temp_path}")
    return temp_path
158
+
159
def apply_preprocessing_to_file(file_bytes, file_ext, preprocessing_options, temp_file_paths):
    """
    Write the (optionally preprocessed) image to a temp file for OCR.

    Preprocessing runs when any of grayscale, denoise, a non-zero contrast
    or rotation, or a non-"standard" document_type is requested.

    Args:
        file_bytes: Original file content as bytes
        file_ext: Original file extension, used when no preprocessing runs
        preprocessing_options: Dictionary of preprocessing options; None is
            treated as "no options"
        temp_file_paths: List that collects created temp paths for cleanup

    Returns:
        tuple: (temp_path, preprocessing_applied)
    """
    preprocessing_options = preprocessing_options or {}

    has_preprocessing = (
        preprocessing_options.get("grayscale", False) or
        preprocessing_options.get("denoise", False) or
        preprocessing_options.get("contrast", 0) != 0 or
        preprocessing_options.get("rotation", 0) != 0 or
        preprocessing_options.get("document_type", "standard") != "standard"
    )

    if not has_preprocessing:
        # No preprocessing needed, just save the original file
        temp_path = create_temp_file(file_bytes, file_ext, temp_file_paths)
        return temp_path, False

    processed_bytes = preprocess_image(file_bytes, preprocessing_options)

    # preprocess_image always emits JPEG data, so the temp file gets a JPEG
    # extension rather than the upload's original one (e.g. ".png").
    temp_path = create_temp_file(processed_bytes, ".jpg", temp_file_paths)
    return temp_path, True
ui/custom.css CHANGED
@@ -1,395 +1,282 @@
1
- /* Minimal essential styling */
2
 
3
- /* Processing status container */
4
- .processing-status-container {
5
- margin: 10px 0;
6
- padding: 8px 12px;
7
- border-left: 3px solid #5c6bc0;
8
- font-size: 0.9rem;
9
  }
10
 
11
- /* Result card styling */
12
- .previous-results-container {
13
- margin-bottom: 20px;
14
- color: #000000 !important;
15
- background-color: #ffffff !important;
16
  }
17
 
18
- /* Let Streamlit handle text colors based on background */
19
- /* Reset forced white text to use Streamlit defaults */
 
 
 
 
 
 
 
 
20
 
21
- .result-card {
 
 
 
 
 
 
 
 
 
22
  border: 1px solid #e0e0e0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  border-radius: 4px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  padding: 15px;
25
  margin-bottom: 15px;
 
 
 
 
 
 
 
26
  }
27
 
28
  .result-header {
29
  display: flex;
30
  justify-content: space-between;
31
  margin-bottom: 10px;
32
- padding-bottom: 5px;
33
- border-bottom: 1px solid #e0e0e0;
34
  }
35
 
36
  .result-filename {
37
  font-weight: bold;
38
- font-size: 1.1rem;
39
  }
40
 
41
  .result-date {
42
- font-size: 0.9rem;
43
  color: #666;
 
44
  }
45
 
46
  .result-metadata {
47
- display: flex;
48
- flex-wrap: wrap;
49
- gap: 8px;
50
- margin-bottom: 10px;
51
  }
52
 
53
  .result-tag {
54
- background-color: #e3f2fd;
55
- border-radius: 16px;
56
- padding: 3px 10px;
57
- font-size: 0.85rem;
58
- color: #1565c0;
 
 
59
  }
60
 
61
  .selected-result-container {
62
- border: 1px solid #e0e0e0;
63
- border-radius: 4px;
64
  padding: 20px;
65
- margin: 15px 0;
 
 
66
  }
67
 
68
  .selected-result-title {
69
- font-size: 1.3rem;
70
  font-weight: bold;
71
- margin-bottom: 15px;
72
  }
73
 
74
- /* Fix for image preprocessing preview */
75
- .stExpander {
76
- overflow: hidden !important;
77
- margin-bottom: 10px !important;
78
  }
79
 
80
- .stExpander img {
81
- max-width: 100% !important;
82
- height: auto !important;
83
- object-fit: contain !important;
84
  }
85
 
86
- /* Additional fixes for image preprocessing preview in expanders */
87
- .streamlit-expanderContent {
88
- overflow: hidden !important;
89
- padding-top: 5px !important;
90
- padding-bottom: 5px !important;
91
  }
92
 
93
- .streamlit-expanderContent img {
94
- max-width: 95% !important;
95
- height: auto !important;
96
- object-fit: contain !important;
97
  }
98
 
99
- /* Compact sidebar expanders */
100
- .stSidebar .stExpander {
101
- margin-top: 0 !important;
102
- margin-bottom: 8px !important;
 
103
  }
104
 
105
- .stSidebar .streamlit-expanderHeader {
106
- font-size: 0.9em !important;
107
- padding: 5px !important;
 
 
 
 
108
  }
109
 
110
- .stSidebar .streamlit-expanderContent {
111
- padding: 5px !important;
112
  }
113
 
114
- /* Better sidebar section spacing */
115
- .stSidebar h1, .stSidebar h2, .stSidebar h3, .stSidebar h4, .stSidebar h5 {
116
- margin-top: 15px !important;
117
- margin-bottom: 5px !important;
118
- padding-top: 0 !important;
119
- padding-bottom: 3px !important;
120
- line-height: 1.2 !important;
121
- font-weight: 600 !important;
122
  }
123
 
124
- /* First heading in sidebar doesn't need top margin */
125
- .stSidebar [data-testid="stVerticalBlock"] > div:first-child h5 {
126
- margin-top: 0 !important;
 
 
 
 
 
127
  }
128
 
129
- /* Optimize sidebar checkbox positioning */
130
- .stSidebar .stCheckbox > div > div {
131
- margin-bottom: 3px !important;
132
  }
133
 
134
- /* Metadata container styling */
135
- .metadata-container {
136
- background-color: #f8f9fa;
137
- border-radius: 4px;
138
- padding: 12px;
139
- margin-bottom: 20px;
140
- margin-top: -10px !important; /* Negative margin to reduce gap with header */
141
- border-left: 3px solid #4285f4;
142
- }
143
-
144
- /* Direct child styling to prevent nested containers */
145
- .element-container > .metadata-container {
146
- margin-top: 0 !important;
147
- }
148
-
149
- /* Fix spacing for headings above metadata container */
150
- .element-container h3 + div .metadata-container,
151
- .element-container h1 + div .metadata-container,
152
- .element-container h2 + div .metadata-container,
153
- .stHeading + div div {
154
- margin-top: 0 !important;
155
- }
156
-
157
- /* Fix for subheader and metadata container spacing */
158
- .stHeading ~ div {
159
- margin-top: -10px !important;
160
- }
161
-
162
- /* Remove excess space between metadata heading and content */
163
- .stMarkdown + div div.element-container,
164
- .stMarkdown + div,
165
- .stHeading + div,
166
- .stHeading + div div.element-container,
167
- header + div.stMarkdown + div,
168
- [data-testid="stHeader"] + div,
169
- .stHeading + * {
170
- margin-top: 0 !important;
171
- padding-top: 0 !important;
172
- }
173
-
174
- /* PDF container fixes */
175
- .stExpander .streamlit-expanderContent {
176
- max-width: 100% !important;
177
- overflow: visible !important;
178
- }
179
-
180
- /* Fix placement of fullscreen buttons, especially in expanders */
181
- .element-container .stImage .stExpander button[title="View fullscreen"] {
182
- position: absolute !important;
183
- top: 5px !important;
184
- right: 5px !important;
185
- }
186
-
187
- /* Fix PDF preview container */
188
- .stPdfViewerContent,
189
- .stPdfViewer,
190
- .stPdfViewerPagesContainer {
191
- width: 100% !important;
192
- max-width: 100% !important;
193
- overflow: visible !important;
194
- }
195
-
196
- /* Fix for expandable content */
197
- .stExpander > div[data-testid="stExpander"] {
198
- max-width: 100% !important;
199
- overflow: visible !important;
200
- }
201
-
202
- /* Fix positioning for fullscreen buttons in image containers */
203
- .stImage button[title="View fullscreen"] {
204
- position: absolute !important;
205
- top: 5px !important;
206
- right: 5px !important;
207
- z-index: 1000 !important;
208
- visibility: visible !important;
209
- opacity: 1 !important;
210
- width: 28px !important;
211
- height: 28px !important;
212
- padding: 0 !important;
213
- margin: 0 !important;
214
- background-color: rgba(255, 255, 255, 0.7) !important;
215
- border-radius: 4px !important;
216
- display: flex !important;
217
- align-items: center !important;
218
- justify-content: center !important;
219
- }
220
-
221
- /* Fix fullscreen button styling */
222
- button[title="View fullscreen"],
223
- button.streamlit-expanderHeader {
224
- z-index: 999 !important;
225
- visibility: visible !important;
226
- opacity: 1 !important;
227
- border-radius: 4px !important;
228
- position: absolute !important;
229
- top: 5px !important;
230
- right: 5px !important;
231
- width: 28px !important;
232
- height: 28px !important;
233
- padding: 0 !important;
234
- margin: 0 !important;
235
- background-color: rgba(255, 255, 255, 0.7) !important;
236
- display: flex !important;
237
- align-items: center !important;
238
- justify-content: center !important;
239
- }
240
-
241
- /* Make text visible in Previous Results tab - ensure high contrast */
242
- .previous-results-container h3,
243
- .previous-results-container p,
244
- .previous-results-container .result-filename,
245
- .previous-results-container .result-date,
246
- .previous-results-container .result-tag {
247
- color: #000000 !important;
248
- text-shadow: none !important;
249
- }
250
-
251
- /* No Results styling with proper contrast */
252
- .previous-results-container[style*="text-align: center"] {
253
- background-color: #f0f2f6 !important;
254
- border-radius: 8px !important;
255
- box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
256
- }
257
-
258
- /* Additional image fixes for all containers */
259
- .document-content img,
260
- .markdown-text-container img,
261
- .page-text-content img,
262
- .image-container img,
263
- .streamlit-expanderContent img {
264
- max-width: 100% !important;
265
- height: auto !important;
266
- object-fit: contain !important;
267
- }
268
-
269
- /* Responsive design rules */
270
- /* Specific rules for mobile/small screens */
271
- @media (max-width: 768px) {
272
- .stExpander img,
273
- .document-content img,
274
- .markdown-text-container img,
275
- .page-text-content img,
276
- .image-container img,
277
- .streamlit-expanderContent img {
278
- max-width: 95% !important;
279
- }
280
-
281
- /* Improve responsive layout for example documents */
282
- .stImage,
283
- .css-6qob1r,
284
- .css-zq5wmm,
285
- .css-fg4pbf,
286
- [data-testid="column"],
287
- [data-testid="stHorizontalBlock"] > div {
288
- margin-bottom: 20px !important;
289
- padding: 0 10px !important;
290
- }
291
-
292
- .stImage img {
293
- width: 100% !important;
294
- max-width: 100% !important;
295
- height: auto !important;
296
- object-fit: contain !important;
297
- }
298
-
299
- .stColumnContainer,
300
- .css-jjjwqm,
301
- .css-fg4pbf,
302
- [data-testid="column"] {
303
- gap: 20px !important;
304
- margin-bottom: 20px !important;
305
- }
306
-
307
- /* Force separate columns on mid-sized screens */
308
- [data-testid="stHorizontalBlock"] {
309
- flex-wrap: wrap !important;
310
- }
311
-
312
- [data-testid="stHorizontalBlock"] > div {
313
- min-width: 45% !important;
314
- flex: 1 1 45% !important;
315
- }
316
- }
317
-
318
- /* Modern Streamlit styling - better responsive behavior */
319
- .block-container {
320
- padding-top: 2rem !important;
321
- padding-bottom: 2rem !important;
322
- }
323
-
324
- /* Specific rules for very small screens (mobile) */
325
- @media (max-width: 640px) {
326
- /* Force single column on very small screens */
327
- .row-widget.stHorizontal > div,
328
- div[data-testid="stHorizontalBlock"] > div {
329
- flex-direction: column !important;
330
- width: 100% !important;
331
- }
332
-
333
- /* Critical fix for column display to prevent overlapping */
334
- [data-testid="column"] {
335
- width: 100% !important;
336
- flex: 1 1 100% !important;
337
- padding: 0 !important;
338
- min-width: 100% !important;
339
- max-width: 100% !important;
340
- float: none !important;
341
- clear: both !important;
342
- display: block !important;
343
- }
344
-
345
- /* Enforce correct column layout for Streamlit's container elements */
346
- div[data-testid="stHorizontalBlock"] {
347
- flex-direction: column !important;
348
- display: block !important;
349
- }
350
-
351
- /* Make images more visible on small screens */
352
- .row-widget.stImage img,
353
- [data-testid="stImage"] > img {
354
- max-width: 100% !important;
355
- width: 100% !important;
356
- margin-bottom: 15px !important;
357
- }
358
-
359
- /* Fix example documents grid layout */
360
- .stImage {
361
- display: block !important;
362
- margin-left: auto !important;
363
- margin-right: auto !important;
364
- width: 100% !important;
365
- }
366
- }
367
-
368
- /* Fix image display in grid layout */
369
- .row-widget.stImage,
370
- .css-z5fcl4 {
371
- text-align: center !important;
372
- margin-bottom: 15px !important;
373
- padding: 0 !important;
374
- }
375
-
376
- .row-widget.stImage img,
377
- .css-z5fcl4 img {
378
- max-height: 250px !important;
379
- object-fit: contain !important;
380
- border-radius: 4px !important;
381
- border: 1px solid rgba(0, 0, 0, 0.1) !important;
382
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1) !important;
383
- }
384
-
385
- /* Better caption styling for images */
386
- .css-z5fcl4 .caption,
387
- .caption,
388
- [data-testid="caption"],
389
- .css-1b0udgb,
390
- .css-183lzff {
391
- margin-top: 5px !important;
392
- font-weight: 500 !important;
393
- text-align: center !important;
394
- font-size: 0.9rem !important;
395
- }
 
1
+ /* Custom CSS for Historical OCR Application */
2
 
3
+ /* Global styles */
4
+ body {
5
+ font-family: 'Source Sans Pro', sans-serif;
6
+ color: #333;
 
 
7
  }
8
 
9
+ /* Header styles */
10
+ h1, h2, h3, h4, h5, h6 {
11
+ font-family: 'Georgia', serif;
12
+ font-weight: 600;
13
+ color: #1E3A8A;
14
  }
15
 
16
+ /* Raw text editor styling */
17
+ .raw-text-editor {
18
+ font-family: 'Courier New', monospace;
19
+ font-size: 14px;
20
+ line-height: 1.5;
21
+ border: 1px solid #ddd;
22
+ border-radius: 4px;
23
+ padding: 10px;
24
+ background-color: #f9f9f9;
25
+ }
26
 
27
+ /* Document content styling */
28
+ .document-content {
29
+ margin-top: 20px;
30
+ }
31
+
32
+ .document-section {
33
+ margin-bottom: 20px;
34
+ padding: 15px;
35
+ background-color: #fff;
36
+ border-radius: 8px;
37
  border: 1px solid #e0e0e0;
38
+ }
39
+
40
+ .document-section h4 {
41
+ margin-top: 0;
42
+ margin-bottom: 10px;
43
+ color: #1E3A8A;
44
+ }
45
+
46
+ /* Subject tag styling */
47
+ .subject-tag {
48
+ display: inline-block;
49
+ padding: 3px 8px;
50
+ border-radius: 12px;
51
+ font-size: 0.85em;
52
+ margin-right: 5px;
53
+ margin-bottom: 5px;
54
+ color: white;
55
+ }
56
+
57
+ .tag-time-period {
58
+ background-color: #1565c0;
59
+ }
60
+
61
+ .tag-language {
62
+ background-color: #00695c;
63
+ }
64
+
65
+ .tag-document-type {
66
+ background-color: #6a1b9a;
67
+ }
68
+
69
+ .tag-subject {
70
+ background-color: #2e7d32;
71
+ }
72
+
73
+ .tag-preprocessing {
74
+ background-color: #e65100;
75
+ }
76
+
77
+ .tag-default {
78
+ background-color: #546e7a;
79
+ }
80
+
81
+ /* Image and text side-by-side styling */
82
+ .image-text-container {
83
+ display: flex;
84
+ gap: 20px;
85
+ margin-bottom: 20px;
86
+ }
87
+
88
+ .image-container {
89
+ flex: 1;
90
+ }
91
+
92
+ .text-container {
93
+ flex: 1;
94
+ }
95
+
96
+ /* Sidebar styling */
97
+ .sidebar-section {
98
+ margin-bottom: 20px;
99
+ }
100
+
101
+ .sidebar-section h3 {
102
+ margin-top: 0;
103
+ margin-bottom: 10px;
104
+ font-size: 16px;
105
+ }
106
+
107
+ /* Button styling */
108
+ .primary-button {
109
+ background-color: #1E88E5;
110
+ color: white;
111
+ border: none;
112
  border-radius: 4px;
113
+ padding: 8px 16px;
114
+ font-weight: 600;
115
+ cursor: pointer;
116
+ transition: background-color 0.2s;
117
+ }
118
+
119
+ .primary-button:hover {
120
+ background-color: #1565C0;
121
+ }
122
+
123
+ .secondary-button {
124
+ background-color: #f8f9fa;
125
+ color: #333;
126
+ border: 1px solid #ddd;
127
+ border-radius: 4px;
128
+ padding: 8px 16px;
129
+ font-weight: 600;
130
+ cursor: pointer;
131
+ transition: background-color 0.2s;
132
+ }
133
+
134
+ .secondary-button:hover {
135
+ background-color: #e9ecef;
136
+ }
137
+
138
+ /* Processing status styling */
139
+ .processing-status {
140
+ padding: 10px 15px;
141
+ border-left: 4px solid #1E88E5;
142
+ background-color: #E3F2FD;
143
+ border-radius: 0 4px 4px 0;
144
+ margin: 10px 0;
145
+ font-size: 14px;
146
+ }
147
+
148
+ /* Previous results styling */
149
+ .previous-results-container {
150
+ margin-top: 20px;
151
+ }
152
+
153
+ .result-card {
154
+ background-color: #f8f9fa;
155
+ border-radius: 8px;
156
  padding: 15px;
157
  margin-bottom: 15px;
158
+ border: 1px solid #e0e0e0;
159
+ transition: all 0.2s ease;
160
+ }
161
+
162
+ .result-card:hover {
163
+ box-shadow: 0 4px 8px rgba(0,0,0,0.1);
164
+ border-color: #c0c0c0;
165
  }
166
 
167
  .result-header {
168
  display: flex;
169
  justify-content: space-between;
170
  margin-bottom: 10px;
 
 
171
  }
172
 
173
  .result-filename {
174
  font-weight: bold;
175
+ font-size: 16px;
176
  }
177
 
178
  .result-date {
 
179
  color: #666;
180
+ font-size: 14px;
181
  }
182
 
183
  .result-metadata {
184
+ margin-top: 10px;
185
+ font-size: 14px;
 
 
186
  }
187
 
188
  .result-tag {
189
+ margin-bottom: 5px;
190
+ color: #555;
191
+ }
192
+
193
+ .result-action-button {
194
+ margin-top: 10px;
195
+ text-align: right;
196
  }
197
 
198
  .selected-result-container {
199
+ margin-top: 30px;
 
200
  padding: 20px;
201
+ background-color: #f0f2f6;
202
+ border-radius: 8px;
203
+ border: 1px solid #d0d7de;
204
  }
205
 
206
  .selected-result-title {
207
+ font-size: 18px;
208
  font-weight: bold;
209
+ color: #1E3A8A;
210
  }
211
 
212
+ /* About tab styling */
213
+ .about-section {
214
+ margin-bottom: 30px;
 
215
  }
216
 
217
+ .about-section h3 {
218
+ color: #1E3A8A;
219
+ margin-bottom: 10px;
 
220
  }
221
 
222
+ .feature-list {
223
+ list-style-type: none;
224
+ padding-left: 0;
 
 
225
  }
226
 
227
+ .feature-list li {
228
+ margin-bottom: 8px;
229
+ padding-left: 20px;
230
+ position: relative;
231
  }
232
 
233
+ .feature-list li:before {
234
+ content: "•";
235
+ position: absolute;
236
+ left: 0;
237
+ color: #1E88E5;
238
  }
239
 
240
+ /* File uploader styling */
241
+ .file-uploader {
242
+ border: 2px dashed #ddd;
243
+ border-radius: 8px;
244
+ padding: 20px;
245
+ text-align: center;
246
+ transition: border-color 0.2s;
247
  }
248
 
249
+ .file-uploader:hover {
250
+ border-color: #1E88E5;
251
  }
252
 
253
+ /* Example documents styling */
254
+ .example-documents {
255
+ margin-top: 20px;
 
 
 
 
 
256
  }
257
 
258
+ .example-card {
259
+ background-color: #f8f9fa;
260
+ border-radius: 8px;
261
+ padding: 15px;
262
+ margin-bottom: 15px;
263
+ border: 1px solid #e0e0e0;
264
+ cursor: pointer;
265
+ transition: all 0.2s ease;
266
  }
267
 
268
+ .example-card:hover {
269
+ box-shadow: 0 4px 8px rgba(0,0,0,0.1);
270
+ border-color: #c0c0c0;
271
  }
272
 
273
+ .example-title {
274
+ font-weight: bold;
275
+ font-size: 16px;
276
+ margin-bottom: 5px;
277
+ }
278
+
279
+ .example-description {
280
+ font-size: 14px;
281
+ color: #555;
282
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui/layout.py CHANGED
@@ -1,27 +1,217 @@
1
- """
2
- UI layout components for the OCR application.
3
- """
4
-
5
- import os
6
  import streamlit as st
7
- from pathlib import Path
8
 
9
  def load_css():
10
- """Load custom CSS for the application."""
11
- # Get the directory of the current file
12
- current_dir = Path(os.path.dirname(os.path.abspath(__file__)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
- # Path to the CSS file
15
- css_file = current_dir / "custom.css"
 
 
 
 
 
 
16
 
17
- # Check if the file exists
18
- if not css_file.exists():
19
- st.warning(f"Custom CSS file not found at {css_file}")
20
- return
 
 
 
 
21
 
22
- # Read the CSS content
23
- with open(css_file) as f:
24
- css_content = f.read()
 
 
 
 
 
25
 
26
- # Apply the CSS
27
- st.markdown(f"<style>{css_content}</style>", unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
 
2
 
3
def load_css():
    """Inject the application's custom CSS into the current Streamlit page.

    The stylesheet is embedded inline in this function and emitted as a
    single ``<style>`` element through ``st.markdown``.
    """
    # unsafe_allow_html=True lets the raw <style> element through instead of
    # having it escaped and shown as text.
    st.markdown("""
    <style>
    /* Global styles */
    body {
        font-family: 'Source Sans Pro', sans-serif;
        color: #333;
    }

    /* Header styles */
    h1, h2, h3, h4, h5, h6 {
        font-family: 'Georgia', serif;
        font-weight: 600;
        color: #1E3A8A;
    }

    /* Processing status container */
    .processing-status-container {
        padding: 10px 15px;
        border-left: 4px solid #1E88E5;
        background-color: #E3F2FD;
        border-radius: 0 4px 4px 0;
        margin: 10px 0;
        font-size: 14px;
    }

    /* Previous results styling */
    .previous-results-container {
        margin-top: 20px;
    }

    .result-card {
        background-color: #f8f9fa;
        border-radius: 8px;
        padding: 15px;
        margin-bottom: 15px;
        border: 1px solid #e0e0e0;
        transition: all 0.2s ease;
    }

    .result-card:hover {
        box-shadow: 0 4px 8px rgba(0,0,0,0.1);
        border-color: #c0c0c0;
    }

    .result-header {
        display: flex;
        justify-content: space-between;
        margin-bottom: 10px;
    }

    .result-filename {
        font-weight: bold;
        font-size: 16px;
    }

    .result-date {
        color: #666;
        font-size: 14px;
    }

    .result-metadata {
        margin-top: 10px;
        font-size: 14px;
    }

    .result-tag {
        margin-bottom: 5px;
        color: #555;
    }

    .result-action-button {
        margin-top: 10px;
        text-align: right;
    }

    .selected-result-container {
        margin-top: 30px;
        padding: 20px;
        background-color: #f0f2f6;
        border-radius: 8px;
        border: 1px solid #d0d7de;
    }

    .selected-result-title {
        font-size: 18px;
        font-weight: bold;
        color: #1E3A8A;
    }

    /* Raw text editor styling */
    .stTextArea textarea {
        font-family: 'Courier New', monospace;
        font-size: 14px;
        line-height: 1.5;
    }

    /* Image and text side-by-side styling */
    .image-text-container {
        display: flex;
        gap: 20px;
        margin-bottom: 20px;
    }

    .image-container {
        flex: 1;
    }

    .text-container {
        flex: 1;
    }

    /* Sidebar styling */
    .sidebar .stRadio > div {
        flex-direction: row;
    }

    .sidebar .stRadio label {
        margin-right: 10px;
    }

    /* Optimize spacing in sidebar */
    .sidebar .block-container {
        padding-top: 0;
    }

    .sidebar [data-testid="stVerticalBlock"] {
        gap: 0;
    }

    /* Button styling */
    .stButton > button {
        border-radius: 4px;
        font-weight: 600;
    }

    /* File uploader styling */
    .stFileUploader > section > div {
        min-height: 100px;
    }

    /* Reset vertical text in file uploader */
    .stFileUploader p,
    .stFileUploader span,
    .stFileUploader div p,
    .stFileUploader div span,
    .stFileUploader label p,
    .stFileUploader label span,
    .stFileUploader div[data-testid="stFileUploadDropzone"] p,
    .stFileUploader div[data-testid="stFileUploadDropzone"] span {
        writing-mode: horizontal-tb !important;
    }

    /* Metadata styling */
    .metadata-card {
        background-color: #f8f9fa;
        border-radius: 8px;
        padding: 15px;
        margin-bottom: 20px;
        border: 1px solid #e0e0e0;
    }

    /* Document content styling */
    .document-content {
        margin-top: 10px;
    }

    /* Tab styling */
    .stTabs [data-baseweb="tab-list"] {
        gap: 8px;
    }

    .stTabs [data-baseweb="tab"] {
        padding: 8px 16px;
        border-radius: 4px 4px 0 0;
    }

    /* Success message styling */
    .stSuccess {
        background-color: #D4EDDA;
        color: #155724;
        padding: 10px;
        border-radius: 4px;
        border-left: 5px solid #155724;
    }

    /* Error message styling */
    .stError {
        background-color: #F8D7DA;
        color: #721C24;
        padding: 10px;
        border-radius: 4px;
        border-left: 5px solid #721C24;
    }

    /* Info message styling */
    .stInfo {
        background-color: #D1ECF1;
        color: #0C5460;
        padding: 10px;
        border-radius: 4px;
        border-left: 5px solid #0C5460;
    }

    /* Warning message styling */
    .stWarning {
        background-color: #FFF3CD;
        color: #856404;
        padding: 10px;
        border-radius: 4px;
        border-left: 5px solid #856404;
    }
    </style>
    """, unsafe_allow_html=True)
ui_components.py ADDED
@@ -0,0 +1,774 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import io
4
+ import base64
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ import json
8
+ from constants import (
9
+ DOCUMENT_TYPES,
10
+ DOCUMENT_LAYOUTS,
11
+ CUSTOM_PROMPT_TEMPLATES,
12
+ LAYOUT_PROMPT_ADDITIONS,
13
+ DEFAULT_PDF_DPI,
14
+ MIN_PDF_DPI,
15
+ MAX_PDF_DPI,
16
+ DEFAULT_MAX_PAGES,
17
+ PERFORMANCE_MODES,
18
+ PREPROCESSING_DOC_TYPES,
19
+ ROTATION_OPTIONS
20
+ )
21
+ from utils import get_base64_from_image, extract_subject_tags
22
+
23
class ProgressReporter:
    """Report processing progress through a Streamlit placeholder.

    The widgets are created lazily by :meth:`setup`; until then
    :meth:`update` and :meth:`complete` skip any widget that is still
    ``None``.
    """

    def __init__(self, placeholder):
        """Remember the placeholder that will host the progress widgets."""
        self.placeholder = placeholder
        self.progress_bar = None
        self.status_text = None

    def setup(self):
        """Create the progress bar and status line inside the placeholder.

        Returns:
            This reporter, so the call can be chained after construction.
        """
        with self.placeholder.container():
            self.progress_bar = st.progress(0)
            self.status_text = st.empty()
        return self

    def update(self, percent, status_text):
        """Update the progress bar and status text.

        Args:
            percent: Progress on a 0-100 scale. Values outside that range
                are clamped, because the progress bar rejects fractions
                outside 0.0-1.0 and would otherwise abort processing.
            status_text: Message shown next to the bar.
        """
        if self.progress_bar is not None:
            bounded = min(max(percent, 0), 100)
            self.progress_bar.progress(bounded / 100)
        if self.status_text is not None:
            self.status_text.text(status_text)

    def complete(self, success=True):
        """Show the final state, pause briefly, then clear the widgets.

        Args:
            success: When true the bar is filled and a completion message is
                shown; otherwise only a failure message is shown.
        """
        if success:
            if self.progress_bar is not None:
                self.progress_bar.progress(100)
            if self.status_text is not None:
                self.status_text.text("Processing complete!")
        else:
            if self.status_text is not None:
                self.status_text.text("Processing failed.")

        # Clear the progress components after a delay
        import time
        time.sleep(0.8)  # Short delay to show completion
        if self.progress_bar is not None:
            self.progress_bar.empty()
        if self.status_text is not None:
            self.status_text.empty()
63
+
64
def create_sidebar_options():
    """Render the OCR settings sidebar and return the chosen options.

    Returns:
        dict: Keys ``use_vision``, ``perf_mode``, ``pdf_dpi``, ``max_pages``,
        ``pdf_rotation``, ``custom_prompt`` and ``preprocessing_options``.
        The last one is itself a dict with ``document_type``, ``grayscale``,
        ``denoise``, ``contrast`` and ``rotation``.
    """
    with st.sidebar:
        st.title("OCR Settings")

        # Create a container for the sidebar options
        with st.container():
            # Model selection
            st.subheader("Model Selection")
            use_vision = st.toggle("Use Vision Model", value=True, help="Use vision model for better understanding of document structure")

            # Performance mode
            perf_mode = st.radio("Performance Mode", PERFORMANCE_MODES,
                                 horizontal=True,
                                 help="Quality: Best results but slower. Speed: Faster but may be less accurate.")

            # Document type selection
            st.subheader("Document Type")
            doc_type = st.selectbox("Document Type", DOCUMENT_TYPES,
                                    help="Select the type of document you're processing for better results")

            # Document layout
            doc_layout = st.selectbox("Document Layout", DOCUMENT_LAYOUTS,
                                      help="Select the layout of your document")

            # Custom prompt
            custom_prompt = ""
            if doc_type != DOCUMENT_TYPES[0]:  # Not auto-detect
                # Get the template for the selected document type
                prompt_template = CUSTOM_PROMPT_TEMPLATES.get(doc_type, "")

                # Add layout information if not standard
                if doc_layout != DOCUMENT_LAYOUTS[0]:  # Not standard layout
                    layout_addition = LAYOUT_PROMPT_ADDITIONS.get(doc_layout, "")
                    if layout_addition:
                        prompt_template += " " + layout_addition

                # Set the custom prompt
                custom_prompt = prompt_template

                # Allow user to edit the prompt. The heading is the widget's
                # own (bold) label rather than a separate markdown line plus
                # an empty label: Streamlit discourages empty labels, and a
                # visible label keeps the help tooltip available.
                custom_prompt = st.text_area("**Custom Processing Instructions**",
                                             value=custom_prompt,
                                             help="Customize the instructions for processing this document",
                                             height=100)

            # Image preprocessing options
            st.subheader("Image Preprocessing")

            # Document type for preprocessing
            preprocessing_doc_type = st.radio("Document Type",
                                              PREPROCESSING_DOC_TYPES,
                                              horizontal=True,
                                              help="Select the type of document for preprocessing")

            # Grayscale conversion
            grayscale = st.checkbox("Convert to Grayscale",
                                    value=False,
                                    help="Convert color images to grayscale for better OCR")

            # Denoise
            denoise = st.checkbox("Denoise Image",
                                  value=False,
                                  help="Remove noise from the image")

            # Contrast adjustment
            contrast = st.slider("Contrast Adjustment",
                                 min_value=-50,
                                 max_value=50,
                                 value=0,
                                 step=10,
                                 help="Adjust image contrast")

            # Rotation
            rotation = st.slider("Rotation",
                                 min_value=-45,
                                 max_value=45,
                                 value=0,
                                 step=5,
                                 help="Rotate image if needed")

            # Create preprocessing options dictionary
            preprocessing_options = {
                "document_type": preprocessing_doc_type,
                "grayscale": grayscale,
                "denoise": denoise,
                "contrast": contrast,
                "rotation": rotation
            }

            # PDF-specific options
            st.subheader("PDF Options")
            pdf_dpi = st.slider("PDF Resolution (DPI)",
                                min_value=MIN_PDF_DPI,
                                max_value=MAX_PDF_DPI,
                                value=DEFAULT_PDF_DPI,
                                step=25,
                                help="Higher DPI gives better quality but slower processing")

            max_pages = st.number_input("Maximum Pages to Process",
                                        min_value=1,
                                        max_value=20,
                                        value=DEFAULT_MAX_PAGES,
                                        help="Limit the number of pages to process (for multi-page PDFs)")

            pdf_rotation = st.radio("PDF Rotation", ROTATION_OPTIONS,
                                    horizontal=True,
                                    format_func=lambda x: f"{x}°",
                                    help="Rotate PDF pages if needed")

    # Create options dictionary
    options = {
        "use_vision": use_vision,
        "perf_mode": perf_mode,
        "pdf_dpi": pdf_dpi,
        "max_pages": max_pages,
        "pdf_rotation": pdf_rotation,
        "custom_prompt": custom_prompt,
        "preprocessing_options": preprocessing_options
    }

    return options
186
+
187
def create_file_uploader():
    """Render the page header and introduction, then the upload widget.

    Returns:
        The value returned by ``st.file_uploader`` for the PDF/image upload.
    """
    # Page header: favicon embedded as a base64 data URI next to the title
    module_dir = os.path.dirname(__file__)
    favicon_base64 = get_base64_from_image(os.path.join(module_dir, "static/favicon.png"))
    header_html = f'<div style="display: flex; align-items: center; gap: 10px;"><img src="data:image/png;base64,{favicon_base64}" width="36" height="36" alt="Scroll Icon"/> <div><h1 style="margin: 0; padding: 20px 0 0 0;">Historical Document OCR</h1></div></div>'
    st.markdown(header_html, unsafe_allow_html=True)
    st.subheader("Made possible by Mistral AI")

    # Short framing of what the tool is for and where it helps most
    st.markdown("""
    This tool is designed to assist scholars in historical research by extracting text from challenging documents.
    While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
    historical documents, particularly:

    - **Historical newspapers** with complex layouts and aged text
    - **Handwritten documents** from various time periods
    - **Photos of archival materials** that may be difficult to read

    Upload a document to get started, or explore the example documents.
    """)

    # The uploader's value is handed straight back to the caller
    return st.file_uploader(
        "Upload a document",
        type=["pdf", "png", "jpg", "jpeg"],
        help="Upload a PDF or image file for OCR processing"
    )
215
+
216
# Badge colors for subject tags. Rules are checked in order and the first match
# wins. Terms are matched as whole words (optionally plural) so that, e.g.,
# "Literature" is not mistaken for a time period because it contains "era".
_BADGE_DEFAULT_COLOR = "#546e7a"
_BADGE_COLOR_RULES = (
    (r"\b(?:century|centuries|eras?|historical)\b|pre-", "#1565c0"),  # Blue for time periods
    (r"\b(?:languages?|english|french|german|latin)\b", "#00695c"),  # Teal for languages
    (r"\b(?:letters?|newspapers?|books?|forms?|documents?|recipes?)\b", "#6a1b9a"),  # Purple for document types
    (r"\b(?:travels?|military|sciences?|medicine|education|arts?|literature)\b", "#2e7d32"),  # Green for subject domains
    (r"\b(?:preprocessed|enhanced|grayscale|denoised|contrast|rotated)\b", "#e65100"),  # Orange for preprocessing tags
)

# Sections of ``ocr_contents`` that are not shown in the structured view.
_SKIPPED_SECTIONS = ('error', 'raw_text', 'partial_text')

# File extensions treated as images by the history filter and ZIP naming.
_IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")


def _topic_badge_color(topic):
    """Return the badge background color for a subject tag."""
    import re

    lowered = str(topic).lower()
    for pattern, color in _BADGE_COLOR_RULES:
        if re.search(pattern, lowered):
            return color
    return _BADGE_DEFAULT_COLOR


def _render_topic_badges(topics):
    """Render subject tags as colored badges inside one flex container."""
    from html import escape

    # Each st.markdown call is rendered as its own element, so the wrapper and
    # all badges must be emitted together for the flex layout to apply.
    # Tags originate from OCR output, hence the escaping.
    badges = "".join(
        f'<span style="background-color: {_topic_badge_color(topic)}; color: white; padding: 3px 8px; '
        f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">{escape(str(topic))}</span>'
        for topic in topics
        if topic is not None
    )
    st.markdown(
        '<div style="display: flex; flex-wrap: wrap; gap: 5px; margin-top: 5px;">' + badges + '</div>',
        unsafe_allow_html=True,
    )


def _clean_languages(result):
    """Return the result's languages as strings, dropping ``None`` entries."""
    return [str(lang) for lang in (result.get('languages') or []) if lang is not None]


def _download_stem(result):
    """Return the base name (directory and last extension removed) for downloads."""
    name = str(result.get('file_name') or 'document')
    stem = os.path.splitext(os.path.basename(name))[0]
    return stem or 'document'


def _extract_raw_text(ocr_contents, fallback_to_repr=False):
    """Return the raw text stored in ``ocr_contents``.

    Looks up ``raw_text`` and then ``content`` when ``ocr_contents`` is a dict
    and returns a plain string as is. When nothing is found, returns
    ``str(ocr_contents)`` if ``fallback_to_repr`` is set, otherwise "".
    """
    if isinstance(ocr_contents, dict):
        for key in ('raw_text', 'content'):
            if key in ocr_contents:
                value = ocr_contents[key]
                if value is None:
                    return ""
                return value if isinstance(value, str) else str(value)
    elif isinstance(ocr_contents, str):
        return ocr_contents
    if fallback_to_repr and ocr_contents is not None:
        return str(ocr_contents)
    return ""


def _render_structured_content(ocr_contents):
    """Render every non-empty section of a dict-shaped ``ocr_contents``."""
    if not isinstance(ocr_contents, dict):
        return
    for section, content in ocr_contents.items():
        if not content or section in _SKIPPED_SECTIONS:
            continue
        st.markdown(f"#### {section.replace('_', ' ').title()}")
        if isinstance(content, str):
            st.write(content)
        elif isinstance(content, list):
            for item in content:
                st.write(f"- {item}")
        elif isinstance(content, dict):
            for k, v in content.items():
                st.write(f"**{k}:** {v}")


def _render_raw_text_editor(result, key_prefix=None):
    """Render the editable raw-text area with its copy and download buttons.

    ``key_prefix`` namespaces the widget keys; when it is falsy the widgets are
    created without explicit keys.
    """
    def _key(suffix):
        return f"{key_prefix}_{suffix}" if key_prefix else None

    raw_text = _extract_raw_text(result.get('ocr_contents'))
    edited_text = st.text_area("Edit Raw Text", raw_text, height=400, key=_key("raw_text"))

    # NOTE(review): this button only shows a confirmation message; nothing in
    # this function writes to the clipboard.
    if st.button("Copy to Clipboard", key=_key("copy_btn")):
        st.success("Text copied to clipboard! (You can paste it elsewhere)")

    st.download_button(
        label="Download Edited Text",
        data=edited_text,
        file_name=f"{_download_stem(result)}_edited.txt",
        mime="text/plain",
        key=_key("download_btn")
    )


def display_results(result, container, custom_prompt=""):
    """Display OCR results in the provided container.

    Args:
        result: OCR result dictionary. Keys read here: ``detected_document_type``,
            ``languages``, ``processing_time``, ``limited_pages``, ``topics``,
            ``ocr_contents``, ``has_images``, ``pages_data`` and ``file_name``.
        container: Context manager (used with ``with``) in which everything
            is rendered.
        custom_prompt: Custom processing instructions; shown in an expander
            when non-empty.
    """
    with container:
        # Document metadata
        st.subheader("Document Metadata")
        meta_col1, meta_col2 = st.columns(2)

        with meta_col1:
            if 'detected_document_type' in result:
                st.write(f"**Document Type:** {result['detected_document_type']}")

            languages = _clean_languages(result)
            if languages:
                st.write(f"**Languages:** {', '.join(languages)}")

        with meta_col2:
            if 'processing_time' in result:
                st.write(f"**Processing Time:** {result['processing_time']:.1f}s")

            # Page information for PDFs
            if 'limited_pages' in result:
                st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")

        # Subject tags
        if result.get('topics'):
            st.write("**Subject Tags:**")
            _render_topic_badges(result['topics'])

        # OCR content
        st.subheader("OCR Content")

        if 'ocr_contents' in result:
            has_images = result.get('has_images', False)
            if has_images:
                content_tab1, content_tab2, content_tab3 = st.tabs(["Structured View", "Raw Text", "With Images"])
            else:
                content_tab1, content_tab2 = st.tabs(["Structured View", "Raw Text"])

            with content_tab1:
                _render_structured_content(result['ocr_contents'])

            with content_tab2:
                _render_raw_text_editor(result)

            if has_images:
                with content_tab3:
                    # Shows a notice itself when 'pages_data' is missing, so
                    # the tab is never left empty.
                    display_document_with_images(result)

        # Custom prompt, if provided
        if custom_prompt:
            with st.expander("Custom Processing Instructions"):
                st.write(custom_prompt)

        # Download buttons
        st.subheader("Download Results")
        download_col1, download_col2 = st.columns(2)

        with download_col1:
            # JSON download; serialization fails for non-JSON values in result
            try:
                json_str = json.dumps(result, indent=2)
                st.download_button(
                    label="Download JSON",
                    data=json_str,
                    file_name=f"{_download_stem(result)}_ocr.json",
                    mime="application/json"
                )
            except Exception as e:
                st.error(f"Error creating JSON download: {str(e)}")

        with download_col2:
            # Text download
            try:
                if 'ocr_contents' in result:
                    text_content = _extract_raw_text(result['ocr_contents'], fallback_to_repr=True)
                else:
                    text_content = "No text content available."

                st.download_button(
                    label="Download Text",
                    data=text_content,
                    file_name=f"{_download_stem(result)}_ocr.txt",
                    mime="text/plain"
                )
            except Exception as e:
                st.error(f"Error creating text download: {str(e)}")


def display_document_with_images(result, key_prefix=""):
    """Display each page of a document as an image next to its editable text.

    Args:
        result: OCR result dictionary; ``pages_data`` is a sequence of page
            dicts with optional ``image_data`` (base64) and ``text`` entries.
        key_prefix: Prefix for the per-page widget keys. Pass a distinct value
            when the same page can render this view more than once, since
            Streamlit rejects duplicate widget keys.
    """
    pages = result.get('pages_data')
    if not pages:
        st.info("No image data available.")
        return

    for i, page_data in enumerate(pages):
        st.markdown(f"### Page {i+1}")

        # Image on the left, text on the right
        img_col, text_col = st.columns([1, 1])

        with img_col:
            if 'image_data' in page_data:
                try:
                    # Convert base64 to image bytes
                    image_data = base64.b64decode(page_data['image_data'])
                    st.image(io.BytesIO(image_data), use_column_width=True)
                except Exception as e:
                    st.error(f"Error displaying image: {str(e)}")
            else:
                st.info("No image available for this page.")

        with text_col:
            if 'text' in page_data:
                st.text_area(f"Page {i+1} Text", page_data['text'], height=300, key=f"{key_prefix}page_text_{i}")

                # NOTE(review): confirmation message only; no clipboard write here.
                if st.button(f"Copy Page {i+1} Text", key=f"{key_prefix}copy_btn_{i}"):
                    st.success(f"Page {i+1} text copied to clipboard!")
            else:
                st.info("No text available for this page.")


def display_previous_results():
    """Display previous results tab content.

    Reads ``st.session_state.previous_results`` (list of OCR result dicts) and
    stores the document chosen through a "View Document" button in
    ``st.session_state.selected_previous_result``.
    """
    from html import escape

    st.markdown('<h2>Previous Results</h2>', unsafe_allow_html=True)

    # Load custom CSS for Previous Results tab
    try:
        from ui.layout import load_css
        load_css()
    except ImportError:
        # If ui.layout module is not available, use a simplified version
        st.markdown("""
        <style>
        .previous-results-container {
            margin-top: 20px;
        }
        .result-card {
            background-color: #f8f9fa;
            border-radius: 8px;
            padding: 15px;
            margin-bottom: 15px;
            border: 1px solid #e0e0e0;
        }
        .result-header {
            display: flex;
            justify-content: space-between;
            margin-bottom: 10px;
        }
        .result-filename {
            font-weight: bold;
            font-size: 16px;
        }
        .result-date {
            color: #666;
            font-size: 14px;
        }
        .result-metadata {
            margin-top: 10px;
            font-size: 14px;
        }
        .result-tag {
            margin-bottom: 5px;
            color: #555;
        }
        .result-action-button {
            margin-top: 10px;
            text-align: right;
        }
        .selected-result-container {
            margin-top: 30px;
            padding: 20px;
            background-color: #f0f2f6;
            border-radius: 8px;
        }
        .selected-result-title {
            font-size: 18px;
            font-weight: bold;
        }
        </style>
        """, unsafe_allow_html=True)

    previous_results = st.session_state.previous_results

    # Empty history: show the placeholder and stop
    if not previous_results:
        st.markdown("""
        <div class="previous-results-container" style="text-align: center; padding: 40px 20px; background-color: #f0f2f6; border-radius: 8px;">
        <div style="font-size: 48px; margin-bottom: 20px;">📄</div>
        <h3 style="margin-bottom: 10px; font-weight: 600;">No Previous Results</h3>
        <p style="font-size: 16px;">Process a document to see your results history saved here.</p>
        </div>
        """, unsafe_allow_html=True)
        return

    def _lower_name(entry):
        return str(entry.get("file_name") or "").lower()

    pdf_results = [r for r in previous_results if _lower_name(r).endswith(".pdf")]
    image_results = [r for r in previous_results if _lower_name(r).endswith(_IMAGE_EXTENSIONS)]

    # Container for the results list
    st.markdown('<div class="previous-results-container">', unsafe_allow_html=True)
    st.markdown(f'<h3>{len(previous_results)} Previous Results</h3>', unsafe_allow_html=True)

    # Two columns: filters and download button
    filter_col, download_col = st.columns([2, 1])

    with filter_col:
        filter_options = ["All Types"]
        if pdf_results:
            filter_options.append("PDF Documents")
        if image_results:
            filter_options.append("Images")

        selected_filter = st.selectbox("Filter by Type:", filter_options)

    with download_col:
        # "Download all" button; the archive is built in memory
        try:
            from ocr_utils import create_results_zip_in_memory

            zip_data = create_results_zip_in_memory(previous_results)

            # Descriptive ZIP filename: document-type counts plus timestamp
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            pdf_count = len(pdf_results)
            img_count = len(image_results)

            if pdf_count > 0 and img_count > 0:
                zip_filename = f"historical_ocr_mixed_{pdf_count}pdf_{img_count}img_{timestamp}.zip"
            elif pdf_count > 0:
                zip_filename = f"historical_ocr_pdf_documents_{pdf_count}_{timestamp}.zip"
            elif img_count > 0:
                zip_filename = f"historical_ocr_images_{img_count}_{timestamp}.zip"
            else:
                zip_filename = f"historical_ocr_results_{timestamp}.zip"

            st.download_button(
                label="Download All Results",
                data=zip_data,
                file_name=zip_filename,
                mime="application/zip",
                help="Download all previous results as a ZIP file containing HTML and JSON files"
            )
        except Exception as e:
            st.error(f"Error creating download: {str(e)}")
            st.info("Try with fewer results or individual downloads")

    # Filter results based on selection
    if selected_filter == "PDF Documents":
        filtered_results = pdf_results
    elif selected_filter == "Images":
        filtered_results = image_results
    else:
        filtered_results = previous_results

    # Message when no results match the filter
    if not filtered_results:
        st.markdown("""
        <div style="text-align: center; padding: 20px; background-color: #f9f9f9; border-radius: 5px; margin: 20px 0;">
        <p>No results match the selected filter.</p>
        </div>
        """, unsafe_allow_html=True)

    # One card per result
    for i, result in enumerate(filtered_results):
        file_name = str(result.get("file_name") or f"Document {i+1}")
        file_type_lower = file_name.lower()

        if file_type_lower.endswith(".pdf"):
            icon = "📄"
        elif file_type_lower.endswith((".jpg", ".jpeg", ".png", ".gif")):
            icon = "🖼️"
        else:
            icon = "📝"

        # All of these values come from processed documents: escape them
        # before placing them in raw HTML.
        title = escape(str(result.get('descriptive_file_name') or file_name))
        stamp = escape(str(result.get('timestamp', 'Unknown')))
        languages_text = escape(', '.join(_clean_languages(result) or ['Unknown']))
        topics = [str(t) for t in (result.get('topics') or ['Unknown']) if t is not None]
        more_text = f" + {len(topics) - 5} more" if len(topics) > 5 else ""
        topics_text = escape(f"{', '.join(topics[:5])} {more_text}")

        st.markdown(f"""
        <div class="result-card">
        <div class="result-header">
        <div class="result-filename">{icon} {title}</div>
        <div class="result-date">{stamp}</div>
        </div>
        <div class="result-metadata">
        <div class="result-tag">Languages: {languages_text}</div>
        <div class="result-tag">Topics: {topics_text}</div>
        </div>
        """, unsafe_allow_html=True)

        # View button
        st.markdown('<div class="result-action-button">', unsafe_allow_html=True)
        if st.button(f"View Document", key=f"view_{i}"):
            # ``i`` indexes the filtered list, so store the card's own result
            # rather than looking it up by position in the full history.
            st.session_state.selected_previous_result = result
            # Force a rerun to show the selected result
            st.rerun()
        st.markdown('</div>', unsafe_allow_html=True)

        # Close the result card
        st.markdown('</div>', unsafe_allow_html=True)

    # Close the container
    st.markdown('</div>', unsafe_allow_html=True)

    # Selected result, if any
    if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
        selected_result = st.session_state.selected_previous_result

        selected_name = escape(str(selected_result.get('file_name', 'Unknown')))
        selected_stamp = escape(str(selected_result.get('timestamp', '')))
        st.markdown(f"""
        <div class="selected-result-container">
        <div class="result-header" style="margin-bottom: 20px;">
        <div class="selected-result-title">Selected Document: {selected_name}</div>
        <div class="result-date">{selected_stamp}</div>
        </div>
        """, unsafe_allow_html=True)

        meta_col1, meta_col2 = st.columns(2)

        with meta_col1:
            # Document metadata
            languages = _clean_languages(selected_result)
            if languages:
                st.write(f"**Languages:** {', '.join(languages)}")

            if selected_result.get('topics'):
                st.markdown("**Subject Tags:**")
                _render_topic_badges(selected_result['topics'])

        with meta_col2:
            # Processing metadata
            if 'limited_pages' in selected_result:
                st.info(f"Processed {selected_result['limited_pages']['processed']} of {selected_result['limited_pages']['total']} pages")

            if 'processing_time' in selected_result:
                proc_time = selected_result['processing_time']
                st.write(f"**Processing Time:** {proc_time:.1f}s")

        # Tabs for content display
        has_images = selected_result.get('has_images', False)
        if has_images:
            view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw Text", "With Images"])
        else:
            view_tab1, view_tab2 = st.tabs(["Structured View", "Raw Text"])

        with view_tab1:
            _render_structured_content(selected_result.get('ocr_contents'))

        with view_tab2:
            _render_raw_text_editor(selected_result, key_prefix="selected")

        if has_images:
            with view_tab3:
                # Distinct key prefix: the current result may render the same
                # per-page widgets elsewhere on the page.
                display_document_with_images(selected_result, key_prefix="selected_")

        # Close the container
        st.markdown('</div>', unsafe_allow_html=True)

        # Button to close the selected result
        if st.button("Close Selected Document", key="close_selected"):
            # Clear the selected result from session state
            del st.session_state.selected_previous_result
            # Force a rerun to update the view
            st.rerun()
729
+
730
def display_about_tab():
    """Display about tab content.

    Renders a heading, one static markdown description (purpose, features,
    usage steps, technologies) and the version line. Takes no input and
    returns nothing.
    """
    st.markdown('<h2>About Historical OCR</h2>', unsafe_allow_html=True)

    # Add app description
    st.markdown("""
    **Historical OCR** is a specialized tool for extracting text from historical documents, manuscripts, and printed materials.

    ### Purpose

    This tool is designed to assist scholars in historical research by extracting text from challenging documents.
    While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
    historical documents, particularly:

    - **Historical newspapers** with complex layouts and aged text
    - **Handwritten documents** from various time periods
    - **Photos of archival materials** that may be difficult to read

    ### Features

    - **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
    - **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
    - **Editable Results**: Review and edit extracted text directly in the interface
    - **Structured Content Analysis**: Automatic organization of document content
    - **Multi-language Support**: Process documents in various languages
    - **PDF Processing**: Handle multi-page historical documents

    ### How to Use

    1. Upload a document (PDF or image)
    2. Select the document type and adjust preprocessing options if needed
    3. Add custom processing instructions for specialized documents
    4. Process the document
    5. Review, edit, and download the results

    ### Technologies

    - OCR processing using Mistral AI's advanced document understanding capabilities
    - Image preprocessing with OpenCV
    - PDF handling with pdf2image
    - Web interface with Streamlit
    """)

    # Add version information (hard-coded in this string)
    st.markdown("**Version:** 1.0.0")
utils.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import hashlib
4
+ import time
5
+ import logging
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ from functools import wraps
9
+ from constants import CONTENT_THEMES, PERIOD_TAGS, DEFAULT_TAGS, GENERIC_TAGS
10
+
11
+ # Configure logging
12
+ logger = logging.getLogger("utils")
13
+ logger.setLevel(logging.INFO)
14
+
15
def get_base64_from_image(image_path):
    """Read a file and return its bytes encoded as base64 text.

    Args:
        image_path: Path of the file to read.

    Returns:
        str: The base64 encoding of the file's contents, or "" when reading
        or encoding fails (the error is logged).
    """
    try:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
            encoded = base64.b64encode(raw_bytes)
            return encoded.decode('utf-8')
    except Exception as exc:
        # Best effort: callers get an empty string instead of an exception.
        logger.error(f"Error encoding image to base64: {str(exc)}")
        return ""
23
+
24
def timing(description):
    """Return a context manager that logs how long its ``with`` body ran.

    Args:
        description: Label placed at the start of the log message.

    Returns:
        A context object exposing ``description`` and, once entered,
        ``start_time``. Exceptions raised in the body are not suppressed.
    """
    class TimingContext:
        def __init__(self, label):
            self.description = label

        def __enter__(self):
            self.start_time = time.time()
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            elapsed = time.time() - self.start_time
            logger.info(f"{self.description} took {elapsed:.2f} seconds")
            # Falsy return value: let any exception from the body propagate.
            return False

    return TimingContext(description)
41
+
42
def format_timestamp(timestamp=None):
    """Format a timestamp as ``YYYY-MM-DD HH:MM`` for display.

    Args:
        timestamp: ``None`` (use the current time), a string in
            ``%Y-%m-%d %H:%M:%S`` format, or an object with ``strftime``.
            A string that does not parse is replaced by the current time.

    Returns:
        str: The formatted timestamp.
    """
    if isinstance(timestamp, str):
        try:
            moment = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
        except ValueError:
            moment = datetime.now()
    elif timestamp is None:
        moment = datetime.now()
    else:
        moment = timestamp

    return moment.strftime("%Y-%m-%d %H:%M")
53
+
54
def generate_cache_key(file_bytes, file_type, use_vision, preprocessing_options=None, pdf_rotation=0, custom_prompt=None):
    """
    Build the cache key that identifies one OCR request.

    Args:
        file_bytes: File content as bytes
        file_type: Type of file (pdf or image)
        use_vision: Whether to use vision model
        preprocessing_options: Dictionary of preprocessing options
        pdf_rotation: PDF rotation value
        custom_prompt: Custom prompt for OCR

    Returns:
        str: ``<file md5>_<file_type>_<use_vision>_<options md5 or empty>``,
        followed by ``_<prompt md5>`` when a custom prompt is given
    """
    def _digest(text):
        return hashlib.md5(text.encode()).hexdigest()

    has_rotation = pdf_rotation != 0

    if preprocessing_options:
        # Hash a copy so the caller's dictionary is never modified; a non-zero
        # rotation is folded in so that it affects the key.
        hashed_options = dict(preprocessing_options)
        if has_rotation:
            hashed_options['pdf_rotation'] = pdf_rotation
        options_digest = _digest(str(sorted(hashed_options.items())))
    elif has_rotation:
        # No preprocessing options, but the rotation must still affect the key
        options_digest = _digest(f"pdf_rotation_{pdf_rotation}")
    else:
        options_digest = ""

    key_parts = [
        hashlib.md5(file_bytes).hexdigest(),
        f"{file_type}",
        f"{use_vision}",
        options_digest,
    ]

    # The custom prompt contributes only when one was provided
    if custom_prompt:
        key_parts.append(_digest(str(custom_prompt)))

    return "_".join(key_parts)
96
+
97
def handle_temp_files(temp_file_paths):
    """
    Delete the given temporary files, logging each outcome.

    Paths that do not exist are skipped; a failure on one path is logged as
    a warning and does not stop the remaining paths from being processed.

    Args:
        temp_file_paths: List of temporary file paths to clean up
    """
    for path in temp_file_paths:
        try:
            if not os.path.exists(path):
                continue
            os.unlink(path)
            logger.info(f"Removed temporary file: {path}")
        except Exception as exc:
            logger.warning(f"Failed to remove temporary file {path}: {str(exc)}")
111
+
112
def create_descriptive_filename(original_filename, result, file_ext, preprocessing_options=None):
    """
    Create a descriptive filename for the result

    The name is ``<original stem>[_<document type>][_<period>]<file_ext>``.
    The document type comes from ``detected_document_type`` or, failing that,
    the first topic. The period is the first topic that mentions a century,
    an era or a "pre-" date. Tags are lowercased and reduced to characters
    that are safe in a file name, and the same tag is never appended twice.

    Args:
        original_filename: Original filename
        result: OCR result dictionary
        file_ext: File extension, including the leading dot
        preprocessing_options: Dictionary of preprocessing options (unused;
            kept for interface compatibility)

    Returns:
        str: Descriptive filename
    """
    import re

    def _slug(tag):
        # Anything other than word characters and hyphens (spaces, slashes,
        # parentheses, ...) becomes a single underscore.
        cleaned = re.sub(r"[^\w\-]+", "_", str(tag).lower())
        return re.sub(r"_+", "_", cleaned).strip("_")

    # Whole-word match, so "Literature" or "General" (which merely contain
    # "era") are not taken for period tags.
    period_pattern = re.compile(r"\b(?:century|centuries|era)\b|pre-")

    # Base name without extension
    original_name = Path(original_filename).stem

    topics = [tag for tag in (result.get('topics') or []) if tag]
    suffixes = []

    # Document type, falling back to the first topic
    doc_type = result.get('detected_document_type')
    if doc_type:
        suffixes.append(_slug(doc_type))
    elif topics:
        suffixes.append(_slug(topics[0]))

    # Period tag for historical context, unless it is already in the name
    for tag in topics:
        if period_pattern.search(str(tag).lower()):
            period_slug = _slug(tag)
            if period_slug not in suffixes:
                suffixes.append(period_slug)
            break

    tag_part = "".join(f"_{suffix}" for suffix in suffixes if suffix)
    return f"{original_name}{tag_part}{file_ext}"
148
+
149
def extract_subject_tags(result, raw_text, preprocessing_options=None):
    """
    Extract subject tags from OCR result

    Tags are collected, without duplicates, from: the result's existing
    topics, the detected document type, keyword themes found in the text
    (``CONTENT_THEMES``), the period of the earliest year mentioned
    (``PERIOD_TAGS``), the detected languages and the preprocessing that
    was applied. The list is then padded from ``DEFAULT_TAGS`` and
    ``GENERIC_TAGS`` until it holds at least three tags, when enough
    distinct ones are available.

    Args:
        result: OCR result dictionary
        raw_text: Raw text from OCR
        preprocessing_options: Dictionary of preprocessing options

    Returns:
        list: Subject tags
    """
    import re

    subject_tags = []

    def _add(tag):
        # Append a tag unless it is empty or already present
        if tag and tag not in subject_tags:
            subject_tags.append(tag)

    try:
        # Existing topics are the starting point
        subject_tags.extend(result.get('topics') or [])

        # Document type, if detected
        detected_type = result.get('detected_document_type')
        if detected_type:
            _add(str(detected_type).capitalize())

        if raw_text:
            # Common themes, based on keywords
            raw_text_lower = raw_text.lower()
            for theme, keywords in CONTENT_THEMES.items():
                if any(keyword in raw_text_lower for keyword in keywords):
                    _add(theme)

            # Period tag for the earliest year found in the content
            year_matches = re.findall(r'\b1[0-9]{3}\b|\b20[0-1][0-9]\b', raw_text)
            if year_matches:
                earliest = min(int(year) for year in year_matches)
                for year_range, period_tag in PERIOD_TAGS.items():
                    if year_range[0] <= earliest <= year_range[1]:
                        _add(period_tag)
                        break

        # Languages as topics. The duplicate check inside _add is made on
        # the tag that is actually appended ("<lang> Language").
        for lang in result.get('languages') or []:
            if lang and lang not in subject_tags:
                _add(f"{lang} Language")

        # Preprocessing information, if preprocessing was applied
        if preprocessing_options:
            doc_kind = preprocessing_options.get("document_type", "standard")
            if doc_kind and doc_kind != "standard":
                _add(f"Enhanced ({str(doc_kind).capitalize()})")

            preprocessing_methods = []
            if preprocessing_options.get("grayscale", False):
                preprocessing_methods.append("Grayscale")
            if preprocessing_options.get("denoise", False):
                preprocessing_methods.append("Denoised")
            contrast_val = preprocessing_options.get("contrast", 0)
            if contrast_val != 0:
                preprocessing_methods.append(
                    "Contrast Enhanced" if contrast_val > 0 else "Contrast Reduced"
                )
            if preprocessing_options.get("rotation", 0) != 0:
                preprocessing_methods.append("Rotated")

            if preprocessing_methods:
                _add("Preprocessed")
                # The specific method is only named when it was the only one
                if len(preprocessing_methods) == 1:
                    _add(preprocessing_methods[0])

    except Exception as e:
        logger.warning(f"Error generating subject tags: {str(e)}")
        # Fallback tags if extraction produced nothing
        if not subject_tags:
            subject_tags.extend(DEFAULT_TAGS)

    # Ensure at least 3 tags: default tags first, then generic ones. Stops
    # early when no unused tag is left.
    for tag in list(DEFAULT_TAGS) + list(GENERIC_TAGS):
        if len(subject_tags) >= 3:
            break
        _add(tag)

    return subject_tags