Reconcile main with preview-improvements branch, implementing modular structure, raw text editing, and enhanced framing
- app.py +0 -0
- constants.py +110 -0
- error_handler.py +65 -0
- ocr_processing.py +279 -0
- preprocessing.py +180 -0
- ui/custom.css +222 -335
- ui/layout.py +210 -20
- ui_components.py +774 -0
- utils.py +263 -0
app.py
CHANGED
The diff for this file is too large to render.
See raw diff
constants.py
ADDED
@@ -0,0 +1,110 @@
"""
Constants for the Historical OCR application.

This module contains all the constants used throughout the application,
making it easier to maintain and update values in one place.
"""

# API limits
MAX_FILE_SIZE_MB = 50
MAX_PAGES = 20

# Caching
CACHE_TTL_SECONDS = 24 * 3600  # 24 hours
MAX_CACHE_ENTRIES = 20

# Image processing
MAX_IMAGE_DIMENSION = 2500
IMAGE_QUALITY = 92

# Document types
DOCUMENT_TYPES = [
    "Auto-detect (standard processing)",
    "Newspaper or Magazine",
    "Letter or Correspondence",
    "Book or Publication",
    "Form or Legal Document",
    "Recipe",
    "Handwritten Document",
    "Map or Illustration",
    "Table or Spreadsheet",
    "Other (specify in instructions)"
]

# Document layouts
DOCUMENT_LAYOUTS = [
    "Standard layout",
    "Multiple columns",
    "Table/grid format",
    "Mixed layout with images"
]

# Preprocessing document types
PREPROCESSING_DOC_TYPES = ["standard", "handwritten", "typed", "printed"]

# Rotation options
ROTATION_OPTIONS = [0, 90, 180, 270]

# PDF settings
DEFAULT_PDF_DPI = 100
MIN_PDF_DPI = 72
MAX_PDF_DPI = 300
DEFAULT_MAX_PAGES = 3

# Performance modes
PERFORMANCE_MODES = ["Quality", "Speed"]

# Custom prompt templates
CUSTOM_PROMPT_TEMPLATES = {
    "Newspaper or Magazine": "This is a newspaper/magazine. Process columns from top to bottom, capture headlines, bylines, article text and captions.",
    "Letter or Correspondence": "This is a letter/correspondence. Capture letterhead, date, greeting, body, closing and signature. Note any handwritten annotations.",
    "Book or Publication": "This is a book/publication. Extract titles, headers, footnotes, page numbers and body text. Preserve paragraph structure and any special formatting.",
    "Form or Legal Document": "This is a form/legal document. Extract all field labels and values, preserving the structure. Pay special attention to signature lines, dates, and any official markings.",
    "Recipe": "This is a recipe. Extract title, ingredients list with measurements, and preparation instructions. Maintain the distinction between ingredients and preparation steps.",
    "Handwritten Document": "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations.",
    "Map or Illustration": "This is a map or illustration. Transcribe all labels, legends, captions, and annotations. Note any scale indicators or directional markings.",
    "Table or Spreadsheet": "This is a table/spreadsheet. Preserve row and column structure, maintaining alignment of data. Extract headers and all cell values.",
    "Other (specify in instructions)": "Please describe the document type and any special processing requirements here."
}

# Layout prompt additions
LAYOUT_PROMPT_ADDITIONS = {
    "Multiple columns": "Document has multiple columns. Read each column from top to bottom, then move to the next column.",
    "Table/grid format": "Document contains table data. Preserve row and column structure during extraction.",
    "Mixed layout with images": "Document has mixed text layout with images. Extract text in proper reading order."
}

# Content themes for subject tag extraction
CONTENT_THEMES = {
    "Historical": ["century", "ancient", "historical", "history", "vintage", "archive", "heritage"],
    "Travel": ["travel", "journey", "expedition", "exploration", "voyage", "map", "location"],
    "Science": ["experiment", "research", "study", "analysis", "scientific", "laboratory"],
    "Literature": ["book", "novel", "poetry", "author", "literary", "chapter", "story"],
    "Art": ["painting", "illustration", "drawing", "artist", "exhibit", "gallery", "portrait"],
    "Education": ["education", "school", "university", "college", "learning", "student", "teach"],
    "Politics": ["government", "political", "policy", "administration", "election", "legislature"],
    "Business": ["business", "company", "corporation", "market", "industry", "commercial", "trade"],
    "Social": ["society", "community", "social", "culture", "tradition", "customs"],
    "Technology": ["technology", "invention", "device", "mechanical", "machine", "technical"],
    "Military": ["military", "army", "navy", "war", "battle", "soldier", "weapon"],
    "Religion": ["religion", "church", "temple", "spiritual", "sacred", "ritual"],
    "Medicine": ["medical", "medicine", "health", "hospital", "treatment", "disease", "doctor"],
    "Legal": ["legal", "law", "court", "justice", "attorney", "judicial", "statute"],
    "Correspondence": ["letter", "mail", "correspondence", "message", "communication"]
}

# Period tags based on year ranges
PERIOD_TAGS = {
    (0, 1799): "Pre-1800s",
    (1800, 1849): "Early 19th Century",
    (1850, 1899): "Late 19th Century",
    (1900, 1949): "Early 20th Century",
    (1950, 2099): "Modern Era"
}

# Default fallback tags
DEFAULT_TAGS = ["Document", "Historical", "Text"]
GENERIC_TAGS = ["Archive", "Content", "Record"]

# UI constants
PROGRESS_DELAY = 0.8  # Seconds to show completion message
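To illustrate how these constants might be consumed elsewhere in the app, here is a minimal sketch of looking up a period tag for a detected year. The helper name period_tag_for_year is hypothetical and not part of this commit; only PERIOD_TAGS and DEFAULT_TAGS come from constants.py above.

# Minimal sketch (not part of this commit): map a year to a period tag.
from constants import PERIOD_TAGS, DEFAULT_TAGS

def period_tag_for_year(year):
    """Return the period label whose (start, end) range contains the year."""
    for (start, end), label in PERIOD_TAGS.items():
        if start <= year <= end:
            return label
    return DEFAULT_TAGS[0]  # fall back to a generic tag

print(period_tag_for_year(1923))  # -> "Early 20th Century"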
error_handler.py
ADDED
@@ -0,0 +1,65 @@
import logging
import streamlit as st
import time
from constants import MAX_FILE_SIZE_MB

# Configure logging
logger = logging.getLogger("error_handler")
logger.setLevel(logging.INFO)

def handle_ocr_error(exception, progress_reporter=None):
    """
    Handle OCR processing errors and provide user-friendly messages

    Args:
        exception: The exception that occurred
        progress_reporter: ProgressReporter instance for UI updates

    Returns:
        str: User-friendly error message
    """
    error_message = str(exception)

    # Complete progress reporting if provided
    if progress_reporter:
        progress_reporter.complete(success=False)

    # Check for specific error types and provide helpful user-facing messages
    if "rate limit" in error_message.lower() or "429" in error_message or "requests rate limit exceeded" in error_message.lower():
        friendly_message = "The AI service is currently experiencing high demand. Please try again in a few minutes."
        logger.error(f"Rate limit error: {error_message}")
        return friendly_message
    elif "quota" in error_message.lower() or "credit" in error_message.lower() or "subscription" in error_message.lower():
        friendly_message = "The API usage quota has been reached. Please check your API key and subscription limits."
        logger.error(f"API quota error: {error_message}")
        return friendly_message
    elif "timeout" in error_message.lower() or "timed out" in error_message.lower():
        friendly_message = "The request timed out. This may be due to a large document or high server load. Please try again or use a smaller document."
        logger.error(f"Timeout error: {error_message}")
        return friendly_message
    elif "file size" in error_message.lower() or "too large" in error_message.lower():
        friendly_message = f"The file is too large. Maximum file size is {MAX_FILE_SIZE_MB}MB."
        logger.error(f"File size error: {error_message}")
        return friendly_message
    else:
        # Generic error message for other errors
        logger.error(f"OCR processing error: {error_message}", exc_info=True)
        return f"An error occurred during processing: {error_message}"

def check_file_size(file_bytes):
    """
    Check if file size is within limits

    Args:
        file_bytes: File content as bytes

    Returns:
        tuple: (is_valid, file_size_mb, error_message)
    """
    file_size_mb = len(file_bytes) / (1024 * 1024)

    if file_size_mb > MAX_FILE_SIZE_MB:
        error_message = f"File size {file_size_mb:.2f} MB exceeds limit of {MAX_FILE_SIZE_MB} MB"
        return False, file_size_mb, error_message

    return True, file_size_mb, None
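A minimal usage sketch of how these two helpers fit together follows; the input path and the run_ocr call are placeholders, not code from this commit.

# Minimal usage sketch (assumption, not part of this commit).
from error_handler import check_file_size, handle_ocr_error

file_bytes = open("sample_document.jpg", "rb").read()  # placeholder input
is_valid, size_mb, error = check_file_size(file_bytes)
if not is_valid:
    print(error)  # e.g. "File size 62.10 MB exceeds limit of 50 MB"
else:
    try:
        run_ocr(file_bytes)  # hypothetical processing call
    except Exception as exc:
        # Returns a friendly message; the raw exception is logged internally.
        print(handle_ocr_error(exc))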
ocr_processing.py
ADDED
@@ -0,0 +1,279 @@
import os
import hashlib
import tempfile
import streamlit as st
import logging
import time
from datetime import datetime
from pathlib import Path
from structured_ocr import StructuredOCR
from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
from preprocessing import apply_preprocessing_to_file
from error_handler import handle_ocr_error, check_file_size

# Configure logging
logger = logging.getLogger("ocr_processing")
logger.setLevel(logging.INFO)

@st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None):
    """
    Cached version of OCR processing to reuse results

    Args:
        file_path: Path to the file to process
        file_type: Type of file (pdf or image)
        use_vision: Whether to use vision model
        file_size_mb: File size in MB
        cache_key: Cache key for the file
        preprocessing_options_hash: Hash of preprocessing options

    Returns:
        dict: OCR result
    """
    # Initialize OCR processor
    processor = StructuredOCR()

    # Process the file
    with timing(f"OCR processing of {file_type} file"):
        result = processor.process_file(
            file_path,
            file_type=file_type,
            use_vision=use_vision,
            file_size_mb=file_size_mb
        )

    return result

def process_file(uploaded_file, use_vision=True, preprocessing_options=None, progress_reporter=None,
                 pdf_dpi=150, max_pages=3, pdf_rotation=0, custom_prompt=None, perf_mode="Quality"):
    """
    Process the uploaded file and return the OCR results

    Args:
        uploaded_file: The uploaded file to process
        use_vision: Whether to use vision model
        preprocessing_options: Dictionary of preprocessing options
        progress_reporter: ProgressReporter instance for UI updates
        pdf_dpi: DPI for PDF conversion
        max_pages: Maximum number of pages to process
        pdf_rotation: PDF rotation value
        custom_prompt: Custom prompt for OCR
        perf_mode: Performance mode (Quality or Speed)

    Returns:
        dict: OCR result
    """
    if preprocessing_options is None:
        preprocessing_options = {}

    # Create a container for progress indicators if not provided
    if progress_reporter is None:
        from ui_components import ProgressReporter
        progress_reporter = ProgressReporter(st.empty()).setup()

    # Initialize temporary file paths list
    temp_file_paths = []

    try:
        # Check if file size exceeds maximum allowed size
        is_valid, file_size_mb, error_message = check_file_size(uploaded_file.getvalue())
        if not is_valid:
            progress_reporter.complete(success=False)
            st.error(error_message)
            return {
                "file_name": uploaded_file.name,
                "topics": ["Document"],
                "languages": ["English"],
                "error": error_message,
                "ocr_contents": {
                    "error": error_message,
                    "partial_text": "Document could not be processed due to size limitations."
                }
            }

        # Update progress
        progress_reporter.update(10, "Initializing OCR processor...")

        # Determine file type from extension
        file_ext = Path(uploaded_file.name).suffix.lower()
        file_type = "pdf" if file_ext == ".pdf" else "image"
        file_bytes = uploaded_file.getvalue()

        # For PDFs, we need to handle differently
        if file_type == "pdf":
            progress_reporter.update(20, "Converting PDF to images...")

            # Process PDF with direct handling
            progress_reporter.update(30, "Processing PDF with OCR...")

            # Create a temporary file for processing
            temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=file_ext).name
            with open(temp_path, 'wb') as f:
                f.write(file_bytes)
            temp_file_paths.append(temp_path)

            # Generate cache key
            cache_key = generate_cache_key(
                file_bytes,
                file_type,
                use_vision,
                preprocessing_options,
                pdf_rotation,
                custom_prompt
            )

            # Process with cached function if possible
            try:
                result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options))
                progress_reporter.update(90, "Finalizing results...")
            except Exception as e:
                logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
                progress_reporter.update(60, f"Processing error: {str(e)}. Retrying...")

                # If caching fails, process directly
                processor = StructuredOCR()

                # Apply performance mode settings
                if perf_mode == "Speed":
                    # Override settings for faster processing
                    if pdf_dpi > 100:
                        pdf_dpi = 100  # Lower DPI for speed

                # Process directly with optimized settings
                result = processor.process_file(
                    file_path=temp_path,
                    file_type="pdf",
                    use_vision=use_vision,
                    custom_prompt=custom_prompt,
                    file_size_mb=file_size_mb,
                    pdf_rotation=pdf_rotation
                )

                progress_reporter.update(90, "Finalizing results...")
        else:
            # For image files
            progress_reporter.update(20, "Preparing image for processing...")

            # Apply preprocessing if needed
            temp_path, preprocessing_applied = apply_preprocessing_to_file(
                file_bytes,
                file_ext,
                preprocessing_options,
                temp_file_paths
            )

            if preprocessing_applied:
                progress_reporter.update(30, "Applied image preprocessing...")

            # Generate cache key
            cache_key = generate_cache_key(
                open(temp_path, 'rb').read(),
                file_type,
                use_vision,
                preprocessing_options,
                0,  # No rotation for images (handled in preprocessing)
                custom_prompt
            )

            # Process the file using cached function if possible
            progress_reporter.update(50, "Processing document with OCR...")
            try:
                result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options))
                progress_reporter.update(80, "Analyzing document structure...")
                progress_reporter.update(90, "Finalizing results...")
            except Exception as e:
                logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
                progress_reporter.update(60, f"Processing error: {str(e)}. Retrying...")

                # If caching fails, process directly
                processor = StructuredOCR()

                # Apply performance mode settings
                if perf_mode == "Speed":
                    # Use simpler processing for speed
                    pass  # Any speed optimizations would be handled by the StructuredOCR class

                result = processor.process_file(
                    file_path=temp_path,
                    file_type=file_type,
                    use_vision=use_vision,
                    custom_prompt=custom_prompt,
                    file_size_mb=file_size_mb
                )

                progress_reporter.update(90, "Finalizing results...")

        # Add additional metadata to result
        result = process_result(result, uploaded_file, preprocessing_options)

        # Complete progress
        progress_reporter.complete()

        return result
    except Exception as e:
        # Handle errors
        error_message = handle_ocr_error(e, progress_reporter)

        # Return error result
        return {
            "file_name": uploaded_file.name,
            "topics": ["Document"],
            "languages": ["English"],
            "error": error_message,
            "ocr_contents": {
                "error": f"Failed to process file: {error_message}",
                "partial_text": "Document could not be processed due to an error."
            }
        }
    finally:
        # Clean up temporary files
        for temp_path in temp_file_paths:
            try:
                if os.path.exists(temp_path):
                    os.unlink(temp_path)
                    logger.info(f"Removed temporary file: {temp_path}")
            except Exception as e:
                logger.warning(f"Failed to remove temporary file {temp_path}: {str(e)}")

def process_result(result, uploaded_file, preprocessing_options=None):
    """
    Process OCR result to add metadata, tags, etc.

    Args:
        result: OCR result dictionary
        uploaded_file: The uploaded file
        preprocessing_options: Dictionary of preprocessing options

    Returns:
        dict: Processed OCR result
    """
    # Add timestamp
    result['timestamp'] = format_timestamp()

    # Add processing time if not already present
    if 'processing_time' not in result:
        result['processing_time'] = 0.0

    # Generate descriptive filename
    file_ext = Path(uploaded_file.name).suffix.lower()
    result['descriptive_file_name'] = create_descriptive_filename(
        uploaded_file.name,
        result,
        file_ext,
        preprocessing_options
    )

    # Extract raw text from OCR contents
    raw_text = ""
    if 'ocr_contents' in result:
        if 'raw_text' in result['ocr_contents']:
            raw_text = result['ocr_contents']['raw_text']
        elif 'content' in result['ocr_contents']:
            raw_text = result['ocr_contents']['content']

    # Extract subject tags if not already present or enhance existing ones
    if 'topics' not in result or not result['topics']:
        result['topics'] = extract_subject_tags(result, raw_text, preprocessing_options)

    return result
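For orientation, this is roughly how app.py could invoke the new module with a Streamlit upload. It is a sketch of a caller, not code taken from this commit; it assumes a file has been uploaded and uses only the parameters defined on process_file above.

# Sketch of a caller (assumption about app.py wiring, not from this diff).
import streamlit as st
from ocr_processing import process_file

uploaded = st.file_uploader("Upload a document", type=["pdf", "png", "jpg", "jpeg"])
if uploaded is not None:
    result = process_file(
        uploaded,
        use_vision=True,
        preprocessing_options={"grayscale": True, "denoise": True},
        pdf_dpi=150,
        max_pages=3,
        perf_mode="Quality",
    )
    if "error" in result:
        st.error(result["error"])          # size-limit or processing failure path
    else:
        st.json(result.get("ocr_contents", {}))  # structured OCR output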
preprocessing.py
ADDED
@@ -0,0 +1,180 @@
import os
import io
import cv2
import numpy as np
import tempfile
from PIL import Image, ImageEnhance, ImageFilter
from pdf2image import convert_from_bytes
import streamlit as st
import logging

# Configure logging
logger = logging.getLogger("preprocessing")
logger.setLevel(logging.INFO)

@st.cache_data(ttl=24*3600, show_spinner=False)  # Cache for 24 hours
def convert_pdf_to_images(pdf_bytes, dpi=150, rotation=0):
    """Convert PDF bytes to a list of images with caching"""
    try:
        images = convert_from_bytes(pdf_bytes, dpi=dpi)

        # Apply rotation if specified
        if rotation != 0 and images:
            rotated_images = []
            for img in images:
                rotated_img = img.rotate(rotation, expand=True, resample=Image.BICUBIC)
                rotated_images.append(rotated_img)
            return rotated_images

        return images
    except Exception as e:
        st.error(f"Error converting PDF: {str(e)}")
        logger.error(f"PDF conversion error: {str(e)}")
        return []

@st.cache_data(ttl=24*3600, show_spinner=False, hash_funcs={dict: lambda x: str(sorted(x.items()))})
def preprocess_image(image_bytes, preprocessing_options):
    """Preprocess image with selected options optimized for historical document OCR quality"""
    # Setup basic console logging
    logger = logging.getLogger("image_preprocessor")
    logger.setLevel(logging.INFO)

    # Log which preprocessing options are being applied
    logger.info(f"Preprocessing image with options: {preprocessing_options}")

    # Convert bytes to PIL Image
    image = Image.open(io.BytesIO(image_bytes))

    # Check for alpha channel (RGBA) and convert to RGB if needed
    if image.mode == 'RGBA':
        # Convert RGBA to RGB by compositing the image onto a white background
        background = Image.new('RGB', image.size, (255, 255, 255))
        background.paste(image, mask=image.split()[3])  # 3 is the alpha channel
        image = background
        logger.info("Converted RGBA image to RGB")
    elif image.mode not in ('RGB', 'L'):
        # Convert other modes to RGB as well
        image = image.convert('RGB')
        logger.info(f"Converted {image.mode} image to RGB")

    # Apply rotation if specified
    if preprocessing_options.get("rotation", 0) != 0:
        rotation_degrees = preprocessing_options.get("rotation")
        image = image.rotate(rotation_degrees, expand=True, resample=Image.BICUBIC)

    # Resize large images while preserving details important for OCR
    width, height = image.size
    max_dimension = max(width, height)

    # Less aggressive resizing to preserve document details
    if max_dimension > 2500:
        scale_factor = 2500 / max_dimension
        new_width = int(width * scale_factor)
        new_height = int(height * scale_factor)
        # Use LANCZOS for better quality preservation
        image = image.resize((new_width, new_height), Image.LANCZOS)

    img_array = np.array(image)

    # Apply preprocessing based on selected options with settings optimized for historical documents
    document_type = preprocessing_options.get("document_type", "standard")

    # Process grayscale option first as it's a common foundation
    if preprocessing_options.get("grayscale", False):
        if len(img_array.shape) == 3:  # Only convert if it's not already grayscale
            if document_type == "handwritten":
                # Enhanced grayscale processing for handwritten documents
                img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
                # Apply adaptive histogram equalization to enhance handwriting
                clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
                img_array = clahe.apply(img_array)
            else:
                # Standard grayscale for printed documents
                img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)

            # Convert back to RGB for further processing
            img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)

    if preprocessing_options.get("contrast", 0) != 0:
        contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 100)
        image = Image.fromarray(img_array)
        enhancer = ImageEnhance.Contrast(image)
        image = enhancer.enhance(contrast_factor)
        img_array = np.array(image)

    if preprocessing_options.get("denoise", False):
        try:
            # Apply appropriate denoising based on document type
            if document_type == "handwritten":
                # Very light denoising for handwritten documents to preserve pen strokes
                if len(img_array.shape) == 3 and img_array.shape[2] == 3:  # Color image
                    img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 3, 3, 5, 9)
                else:  # Grayscale image
                    img_array = cv2.fastNlMeansDenoising(img_array, None, 3, 7, 21)
            else:
                # Standard denoising for printed documents
                if len(img_array.shape) == 3 and img_array.shape[2] == 3:  # Color image
                    img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 5, 5, 7, 21)
                else:  # Grayscale image
                    img_array = cv2.fastNlMeansDenoising(img_array, None, 5, 7, 21)
        except Exception as e:
            logger.error(f"Denoising error: {str(e)}, falling back to standard processing")

    # Convert back to PIL Image
    processed_image = Image.fromarray(img_array)

    # Higher quality for OCR processing
    byte_io = io.BytesIO()
    try:
        # Make sure the image is in RGB mode before saving as JPEG
        if processed_image.mode not in ('RGB', 'L'):
            processed_image = processed_image.convert('RGB')

        processed_image.save(byte_io, format='JPEG', quality=92, optimize=True)
        byte_io.seek(0)

        logger.info(f"Preprocessing complete. Original image mode: {image.mode}, processed mode: {processed_image.mode}")
        logger.info(f"Original size: {len(image_bytes)/1024:.1f}KB, processed size: {len(byte_io.getvalue())/1024:.1f}KB")

        return byte_io.getvalue()
    except Exception as e:
        logger.error(f"Error saving processed image: {str(e)}")
        # Fallback to original image
        logger.info("Using original image as fallback")
        image_io = io.BytesIO()
        image.save(image_io, format='JPEG', quality=92)
        image_io.seek(0)
        return image_io.getvalue()

def create_temp_file(content, suffix, temp_file_paths):
    """Create a temporary file and track it for cleanup"""
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(content)
        temp_path = tmp.name
    # Track temporary file for cleanup
    temp_file_paths.append(temp_path)
    logger.info(f"Created temporary file: {temp_path}")
    return temp_path

def apply_preprocessing_to_file(file_bytes, file_ext, preprocessing_options, temp_file_paths):
    """Apply preprocessing to file and return path to processed file"""
    # Check if any preprocessing options with boolean values are True, or if any non-boolean values are non-default
    has_preprocessing = (
        preprocessing_options.get("grayscale", False) or
        preprocessing_options.get("denoise", False) or
        preprocessing_options.get("contrast", 0) != 0 or
        preprocessing_options.get("rotation", 0) != 0 or
        preprocessing_options.get("document_type", "standard") != "standard"
    )

    if has_preprocessing:
        # Apply preprocessing
        processed_bytes = preprocess_image(file_bytes, preprocessing_options)

        # Save processed image to temp file
        temp_path = create_temp_file(processed_bytes, file_ext, temp_file_paths)
        return temp_path, True  # Return path and flag indicating preprocessing was applied
    else:
        # No preprocessing needed, just save the original file
        temp_path = create_temp_file(file_bytes, file_ext, temp_file_paths)
        return temp_path, False  # Return path and flag indicating no preprocessing was applied
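The options dict that drives this module is plain keys and values. A minimal sketch of exercising it from within the app follows; the sample path is a placeholder, and the keys shown are the ones the functions above read.

# Minimal sketch (assumption): exercising the preprocessing pipeline directly.
from preprocessing import apply_preprocessing_to_file

options = {
    "document_type": "handwritten",  # one of PREPROCESSING_DOC_TYPES
    "grayscale": True,
    "denoise": True,
    "contrast": 10,   # percent adjustment; 0 means unchanged
    "rotation": 0,    # degrees; 0/90/180/270
}

temp_paths = []  # the module appends every temp file it creates here
file_bytes = open("sample_page.jpg", "rb").read()  # placeholder input
path, applied = apply_preprocessing_to_file(file_bytes, ".jpg", options, temp_paths)
print(path, applied)  # processed temp file path, True because the options are non-default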
ui/custom.css
CHANGED
@@ -1,395 +1,282 @@
/* Custom CSS for Historical OCR Application */

/* Global styles */
body {
    font-family: 'Source Sans Pro', sans-serif;
    color: #333;
}

/* Header styles */
h1, h2, h3, h4, h5, h6 {
    font-family: 'Georgia', serif;
    font-weight: 600;
    color: #1E3A8A;
}

/* Raw text editor styling */
.raw-text-editor {
    font-family: 'Courier New', monospace;
    font-size: 14px;
    line-height: 1.5;
    border: 1px solid #ddd;
    border-radius: 4px;
    padding: 10px;
    background-color: #f9f9f9;
}

/* Document content styling */
.document-content {
    margin-top: 20px;
}

.document-section {
    margin-bottom: 20px;
    padding: 15px;
    background-color: #fff;
    border-radius: 8px;
    border: 1px solid #e0e0e0;
}

.document-section h4 {
    margin-top: 0;
    margin-bottom: 10px;
    color: #1E3A8A;
}

/* Subject tag styling */
.subject-tag {
    display: inline-block;
    padding: 3px 8px;
    border-radius: 12px;
    font-size: 0.85em;
    margin-right: 5px;
    margin-bottom: 5px;
    color: white;
}

.tag-time-period {
    background-color: #1565c0;
}

.tag-language {
    background-color: #00695c;
}

.tag-document-type {
    background-color: #6a1b9a;
}

.tag-subject {
    background-color: #2e7d32;
}

.tag-preprocessing {
    background-color: #e65100;
}

.tag-default {
    background-color: #546e7a;
}

/* Image and text side-by-side styling */
.image-text-container {
    display: flex;
    gap: 20px;
    margin-bottom: 20px;
}

.image-container {
    flex: 1;
}

.text-container {
    flex: 1;
}

/* Sidebar styling */
.sidebar-section {
    margin-bottom: 20px;
}

.sidebar-section h3 {
    margin-top: 0;
    margin-bottom: 10px;
    font-size: 16px;
}

/* Button styling */
.primary-button {
    background-color: #1E88E5;
    color: white;
    border: none;
    border-radius: 4px;
    padding: 8px 16px;
    font-weight: 600;
    cursor: pointer;
    transition: background-color 0.2s;
}

.primary-button:hover {
    background-color: #1565C0;
}

.secondary-button {
    background-color: #f8f9fa;
    color: #333;
    border: 1px solid #ddd;
    border-radius: 4px;
    padding: 8px 16px;
    font-weight: 600;
    cursor: pointer;
    transition: background-color 0.2s;
}

.secondary-button:hover {
    background-color: #e9ecef;
}

/* Processing status styling */
.processing-status {
    padding: 10px 15px;
    border-left: 4px solid #1E88E5;
    background-color: #E3F2FD;
    border-radius: 0 4px 4px 0;
    margin: 10px 0;
    font-size: 14px;
}

/* Previous results styling */
.previous-results-container {
    margin-top: 20px;
}

.result-card {
    background-color: #f8f9fa;
    border-radius: 8px;
    padding: 15px;
    margin-bottom: 15px;
    border: 1px solid #e0e0e0;
    transition: all 0.2s ease;
}

.result-card:hover {
    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
    border-color: #c0c0c0;
}

.result-header {
    display: flex;
    justify-content: space-between;
    margin-bottom: 10px;
}

.result-filename {
    font-weight: bold;
    font-size: 16px;
}

.result-date {
    color: #666;
    font-size: 14px;
}

.result-metadata {
    margin-top: 10px;
    font-size: 14px;
}

.result-tag {
    margin-bottom: 5px;
    color: #555;
}

.result-action-button {
    margin-top: 10px;
    text-align: right;
}

.selected-result-container {
    margin-top: 30px;
    padding: 20px;
    background-color: #f0f2f6;
    border-radius: 8px;
    border: 1px solid #d0d7de;
}

.selected-result-title {
    font-size: 18px;
    font-weight: bold;
    color: #1E3A8A;
}

/* About tab styling */
.about-section {
    margin-bottom: 30px;
}

.about-section h3 {
    color: #1E3A8A;
    margin-bottom: 10px;
}

.feature-list {
    list-style-type: none;
    padding-left: 0;
}

.feature-list li {
    margin-bottom: 8px;
    padding-left: 20px;
    position: relative;
}

.feature-list li:before {
    content: "•";
    position: absolute;
    left: 0;
    color: #1E88E5;
}

/* File uploader styling */
.file-uploader {
    border: 2px dashed #ddd;
    border-radius: 8px;
    padding: 20px;
    text-align: center;
    transition: border-color 0.2s;
}

.file-uploader:hover {
    border-color: #1E88E5;
}

/* Example documents styling */
.example-documents {
    margin-top: 20px;
}

.example-card {
    background-color: #f8f9fa;
    border-radius: 8px;
    padding: 15px;
    margin-bottom: 15px;
    border: 1px solid #e0e0e0;
    cursor: pointer;
    transition: all 0.2s ease;
}

.example-card:hover {
    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
    border-color: #c0c0c0;
}

.example-title {
    font-weight: bold;
    font-size: 16px;
    margin-bottom: 5px;
}

.example-description {
    font-size: 14px;
    color: #555;
}
ui/layout.py
CHANGED
@@ -1,27 +1,217 @@
import streamlit as st

def load_css():
    """Load custom CSS for the application"""
    st.markdown("""
    <style>
    /* Global styles */
    body {
        font-family: 'Source Sans Pro', sans-serif;
        color: #333;
    }

    /* Header styles */
    h1, h2, h3, h4, h5, h6 {
        font-family: 'Georgia', serif;
        font-weight: 600;
        color: #1E3A8A;
    }

    /* Processing status container */
    .processing-status-container {
        padding: 10px 15px;
        border-left: 4px solid #1E88E5;
        background-color: #E3F2FD;
        border-radius: 0 4px 4px 0;
        margin: 10px 0;
        font-size: 14px;
    }

    /* Previous results styling */
    .previous-results-container {
        margin-top: 20px;
    }

    .result-card {
        background-color: #f8f9fa;
        border-radius: 8px;
        padding: 15px;
        margin-bottom: 15px;
        border: 1px solid #e0e0e0;
        transition: all 0.2s ease;
    }

    .result-card:hover {
        box-shadow: 0 4px 8px rgba(0,0,0,0.1);
        border-color: #c0c0c0;
    }

    .result-header {
        display: flex;
        justify-content: space-between;
        margin-bottom: 10px;
    }

    .result-filename {
        font-weight: bold;
        font-size: 16px;
    }

    .result-date {
        color: #666;
        font-size: 14px;
    }

    .result-metadata {
        margin-top: 10px;
        font-size: 14px;
    }

    .result-tag {
        margin-bottom: 5px;
        color: #555;
    }

    .result-action-button {
        margin-top: 10px;
        text-align: right;
    }

    .selected-result-container {
        margin-top: 30px;
        padding: 20px;
        background-color: #f0f2f6;
        border-radius: 8px;
        border: 1px solid #d0d7de;
    }

    .selected-result-title {
        font-size: 18px;
        font-weight: bold;
        color: #1E3A8A;
    }

    /* Raw text editor styling */
    .stTextArea textarea {
        font-family: 'Courier New', monospace;
        font-size: 14px;
        line-height: 1.5;
    }

    /* Image and text side-by-side styling */
    .image-text-container {
        display: flex;
        gap: 20px;
        margin-bottom: 20px;
    }

    .image-container {
        flex: 1;
    }

    .text-container {
        flex: 1;
    }

    /* Sidebar styling */
    .sidebar .stRadio > div {
        flex-direction: row;
    }

    .sidebar .stRadio label {
        margin-right: 10px;
    }

    /* Optimize spacing in sidebar */
    .sidebar .block-container {
        padding-top: 0;
    }

    .sidebar [data-testid="stVerticalBlock"] {
        gap: 0;
    }

    /* Button styling */
    .stButton > button {
        border-radius: 4px;
        font-weight: 600;
    }

    /* File uploader styling */
    .stFileUploader > section > div {
        min-height: 100px;
    }

    /* Reset vertical text in file uploader */
    .stFileUploader p,
    .stFileUploader span,
    .stFileUploader div p,
    .stFileUploader div span,
    .stFileUploader label p,
    .stFileUploader label span,
    .stFileUploader div[data-testid="stFileUploadDropzone"] p,
    .stFileUploader div[data-testid="stFileUploadDropzone"] span {
        writing-mode: horizontal-tb !important;
    }

    /* Metadata styling */
    .metadata-card {
        background-color: #f8f9fa;
        border-radius: 8px;
        padding: 15px;
        margin-bottom: 20px;
        border: 1px solid #e0e0e0;
    }

    /* Document content styling */
    .document-content {
        margin-top: 10px;
    }

    /* Tab styling */
    .stTabs [data-baseweb="tab-list"] {
        gap: 8px;
    }

    .stTabs [data-baseweb="tab"] {
        padding: 8px 16px;
        border-radius: 4px 4px 0 0;
    }

    /* Success message styling */
    .stSuccess {
        background-color: #D4EDDA;
        color: #155724;
        padding: 10px;
        border-radius: 4px;
        border-left: 5px solid #155724;
    }

    /* Error message styling */
    .stError {
        background-color: #F8D7DA;
        color: #721C24;
        padding: 10px;
        border-radius: 4px;
        border-left: 5px solid #721C24;
    }

    /* Info message styling */
    .stInfo {
        background-color: #D1ECF1;
        color: #0C5460;
        padding: 10px;
        border-radius: 4px;
        border-left: 5px solid #0C5460;
    }

    /* Warning message styling */
    .stWarning {
        background-color: #FFF3CD;
        color: #856404;
        padding: 10px;
        border-radius: 4px;
        border-left: 5px solid #856404;
    }
    </style>
    """, unsafe_allow_html=True)
ui_components.py
ADDED
@@ -0,0 +1,774 @@
import streamlit as st
import os
import io
import base64
from datetime import datetime
from pathlib import Path
import json
from constants import (
    DOCUMENT_TYPES,
    DOCUMENT_LAYOUTS,
    CUSTOM_PROMPT_TEMPLATES,
    LAYOUT_PROMPT_ADDITIONS,
    DEFAULT_PDF_DPI,
    MIN_PDF_DPI,
    MAX_PDF_DPI,
    DEFAULT_MAX_PAGES,
    PERFORMANCE_MODES,
    PREPROCESSING_DOC_TYPES,
    ROTATION_OPTIONS
)
from utils import get_base64_from_image, extract_subject_tags

class ProgressReporter:
    """Class to handle progress reporting in the UI"""

    def __init__(self, placeholder):
        self.placeholder = placeholder
        self.progress_bar = None
        self.status_text = None

    def setup(self):
        """Setup the progress components"""
        with self.placeholder.container():
            self.progress_bar = st.progress(0)
            self.status_text = st.empty()
        return self

    def update(self, percent, status_text):
        """Update the progress bar and status text"""
        if self.progress_bar is not None:
            self.progress_bar.progress(percent / 100)
        if self.status_text is not None:
            self.status_text.text(status_text)

    def complete(self, success=True):
        """Complete the progress reporting"""
        if success:
            if self.progress_bar is not None:
                self.progress_bar.progress(100)
            if self.status_text is not None:
                self.status_text.text("Processing complete!")
        else:
            if self.status_text is not None:
                self.status_text.text("Processing failed.")

        # Clear the progress components after a delay
        import time
        time.sleep(0.8)  # Short delay to show completion
        if self.progress_bar is not None:
            self.progress_bar.empty()
        if self.status_text is not None:
            self.status_text.empty()

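A minimal usage sketch for the class above (hypothetical caller code, not part of the committed file) — the caller is assumed to hand in a Streamlit placeholder created with st.empty():

# Illustrative only: drive ProgressReporter from the main app flow.
placeholder = st.empty()                          # reserve a spot on the page
reporter = ProgressReporter(placeholder).setup()  # setup() returns the reporter itself
reporter.update(25, "Preprocessing image...")
reporter.update(75, "Running OCR...")
reporter.complete(success=True)                   # shows completion, then clears the widgets
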
def create_sidebar_options():
    """Create and return sidebar options"""
    with st.sidebar:
        st.title("OCR Settings")

        # Create a container for the sidebar options
        with st.container():
            # Model selection
            st.subheader("Model Selection")
            use_vision = st.toggle("Use Vision Model", value=True, help="Use vision model for better understanding of document structure")

            # Performance mode
            perf_mode = st.radio("Performance Mode", PERFORMANCE_MODES,
                                 horizontal=True,
                                 help="Quality: Best results but slower. Speed: Faster but may be less accurate.")

            # Document type selection
            st.subheader("Document Type")
            doc_type = st.selectbox("Document Type", DOCUMENT_TYPES,
                                    help="Select the type of document you're processing for better results")

            # Document layout
            doc_layout = st.selectbox("Document Layout", DOCUMENT_LAYOUTS,
                                      help="Select the layout of your document")

            # Custom prompt
            custom_prompt = ""
            if doc_type != DOCUMENT_TYPES[0]:  # Not auto-detect
                # Get the template for the selected document type
                prompt_template = CUSTOM_PROMPT_TEMPLATES.get(doc_type, "")

                # Add layout information if not standard
                if doc_layout != DOCUMENT_LAYOUTS[0]:  # Not standard layout
                    layout_addition = LAYOUT_PROMPT_ADDITIONS.get(doc_layout, "")
                    if layout_addition:
                        prompt_template += " " + layout_addition

                # Set the custom prompt
                custom_prompt = prompt_template

            # Allow user to edit the prompt
            st.markdown("**Custom Processing Instructions**")
            custom_prompt = st.text_area("", value=custom_prompt,
                                         help="Customize the instructions for processing this document",
                                         height=100)

            # Image preprocessing options
            st.subheader("Image Preprocessing")

            # Document type for preprocessing
            preprocessing_doc_type = st.radio("Document Type",
                                              PREPROCESSING_DOC_TYPES,
                                              horizontal=True,
                                              help="Select the type of document for preprocessing")

            # Grayscale conversion
            grayscale = st.checkbox("Convert to Grayscale",
                                    value=False,
                                    help="Convert color images to grayscale for better OCR")

            # Denoise
            denoise = st.checkbox("Denoise Image",
                                  value=False,
                                  help="Remove noise from the image")

            # Contrast adjustment
            contrast = st.slider("Contrast Adjustment",
                                 min_value=-50,
                                 max_value=50,
                                 value=0,
                                 step=10,
                                 help="Adjust image contrast")

            # Rotation
            rotation = st.slider("Rotation",
                                 min_value=-45,
                                 max_value=45,
                                 value=0,
                                 step=5,
                                 help="Rotate image if needed")

            # Create preprocessing options dictionary
            preprocessing_options = {
                "document_type": preprocessing_doc_type,
                "grayscale": grayscale,
                "denoise": denoise,
                "contrast": contrast,
                "rotation": rotation
            }

            # PDF-specific options
            st.subheader("PDF Options")
            pdf_dpi = st.slider("PDF Resolution (DPI)",
                                min_value=MIN_PDF_DPI,
                                max_value=MAX_PDF_DPI,
                                value=DEFAULT_PDF_DPI,
                                step=25,
                                help="Higher DPI gives better quality but slower processing")

            max_pages = st.number_input("Maximum Pages to Process",
                                        min_value=1,
                                        max_value=20,
                                        value=DEFAULT_MAX_PAGES,
                                        help="Limit the number of pages to process (for multi-page PDFs)")

            pdf_rotation = st.radio("PDF Rotation", ROTATION_OPTIONS,
                                    horizontal=True,
                                    format_func=lambda x: f"{x}°",
                                    help="Rotate PDF pages if needed")

            # Create options dictionary
            options = {
                "use_vision": use_vision,
                "perf_mode": perf_mode,
                "pdf_dpi": pdf_dpi,
                "max_pages": max_pages,
                "pdf_rotation": pdf_rotation,
                "custom_prompt": custom_prompt,
                "preprocessing_options": preprocessing_options
            }

            return options

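A sketch of how the returned dictionary might be consumed downstream (hypothetical; app.py's actual wiring is not shown in this diff), using only the keys defined above:

# Illustrative consumer of the sidebar options.
options = create_sidebar_options()
if options["use_vision"]:
    dpi = options["pdf_dpi"]                         # DPI used when rasterizing PDF pages
    prompt = options["custom_prompt"]                # optional per-document instructions
    preprocessing = options["preprocessing_options"]  # passed to the image preprocessing step
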
def create_file_uploader():
    """Create and return a file uploader"""
    # Add app description
    favicon_path = os.path.join(os.path.dirname(__file__), "static/favicon.png")
    favicon_base64 = get_base64_from_image(favicon_path)
    st.markdown(f'<div style="display: flex; align-items: center; gap: 10px;"><img src="data:image/png;base64,{favicon_base64}" width="36" height="36" alt="Scroll Icon"/> <div><h1 style="margin: 0; padding: 20px 0 0 0;">Historical Document OCR</h1></div></div>', unsafe_allow_html=True)
    st.subheader("Made possible by Mistral AI")

    # Add project framing
    st.markdown("""
    This tool is designed to assist scholars in historical research by extracting text from challenging documents.
    While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
    historical documents, particularly:

    - **Historical newspapers** with complex layouts and aged text
    - **Handwritten documents** from various time periods
    - **Photos of archival materials** that may be difficult to read

    Upload a document to get started, or explore the example documents.
    """)

    # Create file uploader
    uploaded_file = st.file_uploader(
        "Upload a document",
        type=["pdf", "png", "jpg", "jpeg"],
        help="Upload a PDF or image file for OCR processing"
    )
    return uploaded_file

def display_results(result, container, custom_prompt=""):
    """Display OCR results in the provided container"""
    with container:
        # Display document metadata
        st.subheader("Document Metadata")

        # Create columns for metadata
        meta_col1, meta_col2 = st.columns(2)

        with meta_col1:
            # Display document type and languages
            if 'detected_document_type' in result:
                st.write(f"**Document Type:** {result['detected_document_type']}")

            if 'languages' in result:
                languages = [lang for lang in result['languages'] if lang is not None]
                if languages:
                    st.write(f"**Languages:** {', '.join(languages)}")

        with meta_col2:
            # Display processing time
            if 'processing_time' in result:
                st.write(f"**Processing Time:** {result['processing_time']:.1f}s")

            # Display page information for PDFs
            if 'limited_pages' in result:
                st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")

        # Display subject tags if available
        if 'topics' in result and result['topics']:
            st.write("**Subject Tags:**")
            # Create a container with flex display for the tags
            st.markdown('<div style="display: flex; flex-wrap: wrap; gap: 5px; margin-top: 5px;">', unsafe_allow_html=True)

            # Generate a badge for each tag
            for topic in result['topics']:
                # Create colored badge based on tag category
                badge_color = "#546e7a"  # Default color

                # Assign colors by category
                if any(term in topic.lower() for term in ["century", "pre-", "era", "historical"]):
                    badge_color = "#1565c0"  # Blue for time periods
                elif any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
                    badge_color = "#00695c"  # Teal for languages
                elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
                    badge_color = "#6a1b9a"  # Purple for document types
                elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
                    badge_color = "#2e7d32"  # Green for subject domains
                elif any(term in topic.lower() for term in ["preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated"]):
                    badge_color = "#e65100"  # Orange for preprocessing-related tags

                st.markdown(
                    f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
                    f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">{topic}</span>',
                    unsafe_allow_html=True
                )

            # Close the container
            st.markdown('</div>', unsafe_allow_html=True)

        # Display OCR content
        st.subheader("OCR Content")

        # Check if we have OCR content
        if 'ocr_contents' in result:
            # Create tabs for different views
            has_images = result.get('has_images', False)
            if has_images:
                content_tab1, content_tab2, content_tab3 = st.tabs(["Structured View", "Raw Text", "With Images"])
            else:
                content_tab1, content_tab2 = st.tabs(["Structured View", "Raw Text"])

            with content_tab1:
                # Display structured content
                if isinstance(result['ocr_contents'], dict):
                    for section, content in result['ocr_contents'].items():
                        if content and section not in ['error', 'raw_text', 'partial_text']:  # Skip error and raw text sections
                            st.markdown(f"#### {section.replace('_', ' ').title()}")

                            if isinstance(content, str):
                                st.write(content)
                            elif isinstance(content, list):
                                for item in content:
                                    if isinstance(item, str):
                                        st.write(f"- {item}")
                                    else:
                                        st.write(f"- {str(item)}")
                            elif isinstance(content, dict):
                                for k, v in content.items():
                                    st.write(f"**{k}:** {v}")

            with content_tab2:
                # Display raw text with editing capability
                raw_text = ""
                if 'raw_text' in result['ocr_contents']:
                    raw_text = result['ocr_contents']['raw_text']
                elif 'content' in result['ocr_contents']:
                    raw_text = result['ocr_contents']['content']

                # Allow editing of the raw text
                edited_text = st.text_area("Edit Raw Text", raw_text, height=400)

                # Add a button to copy the edited text to clipboard
                if st.button("Copy to Clipboard"):
                    st.success("Text copied to clipboard! (You can paste it elsewhere)")
                    # Note: The actual clipboard functionality is handled by the browser

                # Add a download button for the edited text
                st.download_button(
                    label="Download Edited Text",
                    data=edited_text,
                    file_name=f"{result.get('file_name', 'document').split('.')[0]}_edited.txt",
                    mime="text/plain"
                )

            if has_images and 'pages_data' in result:
                with content_tab3:
                    # Use the display_document_with_images function
                    display_document_with_images(result)

        # Display custom prompt if provided
        if custom_prompt:
            with st.expander("Custom Processing Instructions"):
                st.write(custom_prompt)

        # Add download buttons
        st.subheader("Download Results")

        # Create columns for download buttons
        download_col1, download_col2 = st.columns(2)

        with download_col1:
            # JSON download
            try:
                json_str = json.dumps(result, indent=2)
                st.download_button(
                    label="Download JSON",
                    data=json_str,
                    file_name=f"{result.get('file_name', 'document').split('.')[0]}_ocr.json",
                    mime="application/json"
                )
            except Exception as e:
                st.error(f"Error creating JSON download: {str(e)}")

        with download_col2:
            # Text download
            try:
                if 'ocr_contents' in result:
                    if 'raw_text' in result['ocr_contents']:
                        text_content = result['ocr_contents']['raw_text']
                    elif 'content' in result['ocr_contents']:
                        text_content = result['ocr_contents']['content']
                    else:
                        text_content = str(result['ocr_contents'])
                else:
                    text_content = "No text content available."

                st.download_button(
                    label="Download Text",
                    data=text_content,
                    file_name=f"{result.get('file_name', 'document').split('.')[0]}_ocr.txt",
                    mime="text/plain"
                )
            except Exception as e:
                st.error(f"Error creating text download: {str(e)}")

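An illustrative call for the function above, using a hypothetical result dictionary whose keys mirror what display_results reads (the actual result shape is produced elsewhere in the app):

# Illustrative only: minimal result dictionary and call.
sample_result = {
    "file_name": "letter_1875.jpg",
    "detected_document_type": "letter",
    "languages": ["English"],
    "processing_time": 4.2,
    "topics": ["Letter", "19th Century"],
    "ocr_contents": {"raw_text": "Dear Margaret, ..."},
}
display_results(sample_result, st.container())
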
382 |
+
def display_document_with_images(result):
|
383 |
+
"""Display document with images"""
|
384 |
+
if 'pages_data' not in result:
|
385 |
+
st.info("No image data available.")
|
386 |
+
return
|
387 |
+
|
388 |
+
# Display each page
|
389 |
+
for i, page_data in enumerate(result['pages_data']):
|
390 |
+
st.markdown(f"### Page {i+1}")
|
391 |
+
|
392 |
+
# Create columns for image and text
|
393 |
+
img_col, text_col = st.columns([1, 1])
|
394 |
+
|
395 |
+
with img_col:
|
396 |
+
# Display the image
|
397 |
+
if 'image_data' in page_data:
|
398 |
+
try:
|
399 |
+
# Convert base64 to image
|
400 |
+
image_data = base64.b64decode(page_data['image_data'])
|
401 |
+
st.image(io.BytesIO(image_data), use_column_width=True)
|
402 |
+
except Exception as e:
|
403 |
+
st.error(f"Error displaying image: {str(e)}")
|
404 |
+
else:
|
405 |
+
st.info("No image available for this page.")
|
406 |
+
|
407 |
+
with text_col:
|
408 |
+
# Display the text with editing capability
|
409 |
+
if 'text' in page_data:
|
410 |
+
edited_text = st.text_area(f"Page {i+1} Text", page_data['text'], height=300, key=f"page_text_{i}")
|
411 |
+
|
412 |
+
# Add a button to copy the edited text to clipboard
|
413 |
+
if st.button(f"Copy Page {i+1} Text", key=f"copy_btn_{i}"):
|
414 |
+
st.success(f"Page {i+1} text copied to clipboard!")
|
415 |
+
else:
|
416 |
+
st.info("No text available for this page.")
|
417 |
+
|
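The per-page structure this function expects can be sketched as follows (hypothetical example; the file path and text are placeholders, and 'image_data' is assumed to be a base64-encoded image string as the decode call above implies):

# Illustrative only: build one page entry and render it.
page = {
    "image_data": base64.b64encode(open("page1.png", "rb").read()).decode("utf-8"),
    "text": "Transcribed text for page 1...",
}
display_document_with_images({"pages_data": [page]})
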
def display_previous_results():
    """Display previous results tab content"""
    st.markdown('<h2>Previous Results</h2>', unsafe_allow_html=True)

    # Load custom CSS for Previous Results tab
    try:
        from ui.layout import load_css
        load_css()
    except ImportError:
        # If ui.layout module is not available, use a simplified version
        st.markdown("""
        <style>
        .previous-results-container {
            margin-top: 20px;
        }
        .result-card {
            background-color: #f8f9fa;
            border-radius: 8px;
            padding: 15px;
            margin-bottom: 15px;
            border: 1px solid #e0e0e0;
        }
        .result-header {
            display: flex;
            justify-content: space-between;
            margin-bottom: 10px;
        }
        .result-filename {
            font-weight: bold;
            font-size: 16px;
        }
        .result-date {
            color: #666;
            font-size: 14px;
        }
        .result-metadata {
            margin-top: 10px;
            font-size: 14px;
        }
        .result-tag {
            margin-bottom: 5px;
            color: #555;
        }
        .result-action-button {
            margin-top: 10px;
            text-align: right;
        }
        .selected-result-container {
            margin-top: 30px;
            padding: 20px;
            background-color: #f0f2f6;
            border-radius: 8px;
        }
        .selected-result-title {
            font-size: 18px;
            font-weight: bold;
        }
        </style>
        """, unsafe_allow_html=True)

    # Display previous results if available
    if not st.session_state.previous_results:
        st.markdown("""
        <div class="previous-results-container" style="text-align: center; padding: 40px 20px; background-color: #f0f2f6; border-radius: 8px;">
            <div style="font-size: 48px; margin-bottom: 20px;">📄</div>
            <h3 style="margin-bottom: 10px; font-weight: 600;">No Previous Results</h3>
            <p style="font-size: 16px;">Process a document to see your results history saved here.</p>
        </div>
        """, unsafe_allow_html=True)
    else:
        # Create a container for the results list
        st.markdown('<div class="previous-results-container">', unsafe_allow_html=True)
        st.markdown(f'<h3>{len(st.session_state.previous_results)} Previous Results</h3>', unsafe_allow_html=True)

        # Create two columns for filters and download buttons
        filter_col, download_col = st.columns([2, 1])

        with filter_col:
            # Add filter options
            filter_options = ["All Types"]
            if any(result.get("file_name", "").lower().endswith(".pdf") for result in st.session_state.previous_results):
                filter_options.append("PDF Documents")
            if any(result.get("file_name", "").lower().endswith((".jpg", ".jpeg", ".png")) for result in st.session_state.previous_results):
                filter_options.append("Images")

            selected_filter = st.selectbox("Filter by Type:", filter_options)

        with download_col:
            # Add download all button for results
            if len(st.session_state.previous_results) > 0:
                try:
                    # Create buffer in memory instead of file on disk
                    import io
                    from ocr_utils import create_results_zip_in_memory

                    # Get zip data directly in memory
                    zip_data = create_results_zip_in_memory(st.session_state.previous_results)

                    # Create more informative ZIP filename with timestamp
                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

                    # Count document types for a more descriptive filename
                    pdf_count = sum(1 for r in st.session_state.previous_results if r.get('file_name', '').lower().endswith('.pdf'))
                    img_count = sum(1 for r in st.session_state.previous_results if r.get('file_name', '').lower().endswith(('.jpg', '.jpeg', '.png')))

                    # Create more descriptive filename
                    if pdf_count > 0 and img_count > 0:
                        zip_filename = f"historical_ocr_mixed_{pdf_count}pdf_{img_count}img_{timestamp}.zip"
                    elif pdf_count > 0:
                        zip_filename = f"historical_ocr_pdf_documents_{pdf_count}_{timestamp}.zip"
                    elif img_count > 0:
                        zip_filename = f"historical_ocr_images_{img_count}_{timestamp}.zip"
                    else:
                        zip_filename = f"historical_ocr_results_{timestamp}.zip"

                    st.download_button(
                        label="Download All Results",
                        data=zip_data,
                        file_name=zip_filename,
                        mime="application/zip",
                        help="Download all previous results as a ZIP file containing HTML and JSON files"
                    )
                except Exception as e:
                    st.error(f"Error creating download: {str(e)}")
                    st.info("Try with fewer results or individual downloads")

        # Filter results based on selection
        filtered_results = st.session_state.previous_results
        if selected_filter == "PDF Documents":
            filtered_results = [r for r in st.session_state.previous_results if r.get("file_name", "").lower().endswith(".pdf")]
        elif selected_filter == "Images":
            filtered_results = [r for r in st.session_state.previous_results if r.get("file_name", "").lower().endswith((".jpg", ".jpeg", ".png"))]

        # Show a message if no results match the filter
        if not filtered_results:
            st.markdown("""
            <div style="text-align: center; padding: 20px; background-color: #f9f9f9; border-radius: 5px; margin: 20px 0;">
                <p>No results match the selected filter.</p>
            </div>
            """, unsafe_allow_html=True)

        # Display each result as a card
        for i, result in enumerate(filtered_results):
            # Determine file type icon
            file_name = result.get("file_name", f"Document {i+1}")
            file_type_lower = file_name.lower()

            if file_type_lower.endswith(".pdf"):
                icon = "📄"
            elif file_type_lower.endswith((".jpg", ".jpeg", ".png", ".gif")):
                icon = "🖼️"
            else:
                icon = "📝"

            # Create a card for each result
            st.markdown(f"""
            <div class="result-card">
                <div class="result-header">
                    <div class="result-filename">{icon} {result.get('descriptive_file_name', file_name)}</div>
                    <div class="result-date">{result.get('timestamp', 'Unknown')}</div>
                </div>
                <div class="result-metadata">
                    <div class="result-tag">Languages: {', '.join(result.get('languages', ['Unknown']))}</div>
                    <div class="result-tag">Topics: {', '.join(result.get('topics', ['Unknown'])[:5])} {' + ' + str(len(result.get('topics', [])) - 5) + ' more' if len(result.get('topics', [])) > 5 else ''}</div>
                </div>
            """, unsafe_allow_html=True)

            # Add view button inside the card with proper styling
            st.markdown('<div class="result-action-button">', unsafe_allow_html=True)
            if st.button(f"View Document", key=f"view_{i}"):
                # Set the selected result in the session state
                st.session_state.selected_previous_result = st.session_state.previous_results[i]
                # Force a rerun to show the selected result
                st.rerun()
            st.markdown('</div>', unsafe_allow_html=True)

            # Close the result card
            st.markdown('</div>', unsafe_allow_html=True)

        # Close the container
        st.markdown('</div>', unsafe_allow_html=True)

    # Display the selected result if available
    if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
        selected_result = st.session_state.selected_previous_result

        # Create a styled container for the selected result
        st.markdown(f"""
        <div class="selected-result-container">
            <div class="result-header" style="margin-bottom: 20px;">
                <div class="selected-result-title">Selected Document: {selected_result.get('file_name', 'Unknown')}</div>
                <div class="result-date">{selected_result.get('timestamp', '')}</div>
            </div>
        """, unsafe_allow_html=True)

        # Display metadata in a styled way
        meta_col1, meta_col2 = st.columns(2)

        with meta_col1:
            # Display document metadata
            if 'languages' in selected_result:
                languages = [lang for lang in selected_result['languages'] if lang is not None]
                if languages:
                    st.write(f"**Languages:** {', '.join(languages)}")

            if 'topics' in selected_result and selected_result['topics']:
                # Show topics in a more organized way with badges
                st.markdown("**Subject Tags:**")
                # Create a container with flex display for the tags
                st.markdown('<div style="display: flex; flex-wrap: wrap; gap: 5px; margin-top: 5px;">', unsafe_allow_html=True)

                # Generate a badge for each tag
                for topic in selected_result['topics']:
                    # Create colored badge based on tag category
                    badge_color = "#546e7a"  # Default color

                    # Assign colors by category
                    if any(term in topic.lower() for term in ["century", "pre-", "era", "historical"]):
                        badge_color = "#1565c0"  # Blue for time periods
                    elif any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
                        badge_color = "#00695c"  # Teal for languages
                    elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
                        badge_color = "#6a1b9a"  # Purple for document types
                    elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
                        badge_color = "#2e7d32"  # Green for subject domains
                    elif any(term in topic.lower() for term in ["preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated"]):
                        badge_color = "#e65100"  # Orange for preprocessing-related tags

                    st.markdown(
                        f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
                        f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">{topic}</span>',
                        unsafe_allow_html=True
                    )

                # Close the container
                st.markdown('</div>', unsafe_allow_html=True)

        with meta_col2:
            # Display processing metadata
            if 'limited_pages' in selected_result:
                st.info(f"Processed {selected_result['limited_pages']['processed']} of {selected_result['limited_pages']['total']} pages")

            if 'processing_time' in selected_result:
                proc_time = selected_result['processing_time']
                st.write(f"**Processing Time:** {proc_time:.1f}s")

        # Create tabs for content display
        has_images = selected_result.get('has_images', False)
        if has_images:
            view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw Text", "With Images"])
        else:
            view_tab1, view_tab2 = st.tabs(["Structured View", "Raw Text"])

        with view_tab1:
            # Display structured content
            if 'ocr_contents' in selected_result and isinstance(selected_result['ocr_contents'], dict):
                for section, content in selected_result['ocr_contents'].items():
                    if content and section not in ['error', 'raw_text', 'partial_text']:  # Skip error and raw text sections
                        st.markdown(f"#### {section.replace('_', ' ').title()}")

                        if isinstance(content, str):
                            st.write(content)
                        elif isinstance(content, list):
                            for item in content:
                                if isinstance(item, str):
                                    st.write(f"- {item}")
                                else:
                                    st.write(f"- {str(item)}")
                        elif isinstance(content, dict):
                            for k, v in content.items():
                                st.write(f"**{k}:** {v}")

        with view_tab2:
            # Display raw text with editing capability
            raw_text = ""
            if 'ocr_contents' in selected_result:
                if 'raw_text' in selected_result['ocr_contents']:
                    raw_text = selected_result['ocr_contents']['raw_text']
                elif 'content' in selected_result['ocr_contents']:
                    raw_text = selected_result['ocr_contents']['content']

            # Allow editing of the raw text
            edited_text = st.text_area("Edit Raw Text", raw_text, height=400, key="selected_raw_text")

            # Add a button to copy the edited text to clipboard
            if st.button("Copy to Clipboard", key="selected_copy_btn"):
                st.success("Text copied to clipboard! (You can paste it elsewhere)")

            # Add a download button for the edited text
            st.download_button(
                label="Download Edited Text",
                data=edited_text,
                file_name=f"{selected_result.get('file_name', 'document').split('.')[0]}_edited.txt",
                mime="text/plain",
                key="selected_download_btn"
            )

        if has_images and 'pages_data' in selected_result:
            with view_tab3:
                # Use the display_document_with_images function
                display_document_with_images(selected_result)

        # Close the container
        st.markdown('</div>', unsafe_allow_html=True)

        # Add a button to close the selected result
        if st.button("Close Selected Document", key="close_selected"):
            # Clear the selected result from session state
            del st.session_state.selected_previous_result
            # Force a rerun to update the view
            st.rerun()

def display_about_tab():
    """Display about tab content"""
    st.markdown('<h2>About Historical OCR</h2>', unsafe_allow_html=True)

    # Add app description
    st.markdown("""
    **Historical OCR** is a specialized tool for extracting text from historical documents, manuscripts, and printed materials.

    ### Purpose

    This tool is designed to assist scholars in historical research by extracting text from challenging documents.
    While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
    historical documents, particularly:

    - **Historical newspapers** with complex layouts and aged text
    - **Handwritten documents** from various time periods
    - **Photos of archival materials** that may be difficult to read

    ### Features

    - **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
    - **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
    - **Editable Results**: Review and edit extracted text directly in the interface
    - **Structured Content Analysis**: Automatic organization of document content
    - **Multi-language Support**: Process documents in various languages
    - **PDF Processing**: Handle multi-page historical documents

    ### How to Use

    1. Upload a document (PDF or image)
    2. Select the document type and adjust preprocessing options if needed
    3. Add custom processing instructions for specialized documents
    4. Process the document
    5. Review, edit, and download the results

    ### Technologies

    - OCR processing using Mistral AI's advanced document understanding capabilities
    - Image preprocessing with OpenCV
    - PDF handling with pdf2image
    - Web interface with Streamlit
    """)

    # Add version information
    st.markdown("**Version:** 1.0.0")
utils.py
ADDED
@@ -0,0 +1,263 @@
import os
import base64
import hashlib
import time
import logging
from datetime import datetime
from pathlib import Path
from functools import wraps
from constants import CONTENT_THEMES, PERIOD_TAGS, DEFAULT_TAGS, GENERIC_TAGS

# Configure logging
logger = logging.getLogger("utils")
logger.setLevel(logging.INFO)

def get_base64_from_image(image_path):
    """Get base64 string from image file"""
    try:
        with open(image_path, "rb") as img_file:
            return base64.b64encode(img_file.read()).decode('utf-8')
    except Exception as e:
        logger.error(f"Error encoding image to base64: {str(e)}")
        return ""

def timing(description):
    """Context manager for timing code execution"""
    class TimingContext:
        def __init__(self, description):
            self.description = description

        def __enter__(self):
            self.start_time = time.time()
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            end_time = time.time()
            execution_time = end_time - self.start_time
            logger.info(f"{self.description} took {execution_time:.2f} seconds")
            return False

    return TimingContext(description)

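A usage sketch for the context manager above (the wrapped call is a placeholder; any slow operation would do) — the elapsed time is written to the module logger on exit:

# Illustrative only: time a block of work.
with timing("PDF rasterization"):
    pages = rasterize_pdf()  # hypothetical slow call, stands in for real work
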
def format_timestamp(timestamp=None):
    """Format timestamp for display"""
    if timestamp is None:
        timestamp = datetime.now()
    elif isinstance(timestamp, str):
        try:
            timestamp = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
        except ValueError:
            timestamp = datetime.now()

    return timestamp.strftime("%Y-%m-%d %H:%M")

def generate_cache_key(file_bytes, file_type, use_vision, preprocessing_options=None, pdf_rotation=0, custom_prompt=None):
    """
    Generate a cache key for OCR processing

    Args:
        file_bytes: File content as bytes
        file_type: Type of file (pdf or image)
        use_vision: Whether to use vision model
        preprocessing_options: Dictionary of preprocessing options
        pdf_rotation: PDF rotation value
        custom_prompt: Custom prompt for OCR

    Returns:
        str: Cache key
    """
    # Generate file hash
    file_hash = hashlib.md5(file_bytes).hexdigest()

    # Include preprocessing options in cache key
    preprocessing_options_hash = ""
    if preprocessing_options:
        # Add pdf_rotation to preprocessing options to ensure it's part of the cache key
        if pdf_rotation != 0:
            preprocessing_options_with_rotation = preprocessing_options.copy()
            preprocessing_options_with_rotation['pdf_rotation'] = pdf_rotation
            preprocessing_str = str(sorted(preprocessing_options_with_rotation.items()))
        else:
            preprocessing_str = str(sorted(preprocessing_options.items()))
        preprocessing_options_hash = hashlib.md5(preprocessing_str.encode()).hexdigest()
    elif pdf_rotation != 0:
        # If no preprocessing options but we have rotation, include that in the hash
        preprocessing_options_hash = hashlib.md5(f"pdf_rotation_{pdf_rotation}".encode()).hexdigest()

    # Create base cache key
    cache_key = f"{file_hash}_{file_type}_{use_vision}_{preprocessing_options_hash}"

    # Include custom prompt in cache key if provided
    if custom_prompt:
        custom_prompt_hash = hashlib.md5(str(custom_prompt).encode()).hexdigest()
        cache_key = f"{cache_key}_{custom_prompt_hash}"

    return cache_key

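A sketch of how the key might back a simple in-memory cache (hypothetical; the _OCR_CACHE dict and helper below are illustrative and not part of this commit, which caches elsewhere):

# Illustrative only: look up a previously computed OCR result by its cache key.
_OCR_CACHE = {}

def get_cached_result(file_bytes, file_type, use_vision, **kwargs):
    key = generate_cache_key(file_bytes, file_type, use_vision, **kwargs)
    return _OCR_CACHE.get(key)  # returns None on a cache miss
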
def handle_temp_files(temp_file_paths):
    """
    Clean up temporary files

    Args:
        temp_file_paths: List of temporary file paths to clean up
    """
    for temp_path in temp_file_paths:
        try:
            if os.path.exists(temp_path):
                os.unlink(temp_path)
                logger.info(f"Removed temporary file: {temp_path}")
        except Exception as e:
            logger.warning(f"Failed to remove temporary file {temp_path}: {str(e)}")

def create_descriptive_filename(original_filename, result, file_ext, preprocessing_options=None):
    """
    Create a descriptive filename for the result

    Args:
        original_filename: Original filename
        result: OCR result dictionary
        file_ext: File extension
        preprocessing_options: Dictionary of preprocessing options

    Returns:
        str: Descriptive filename
    """
    # Get base name without extension
    original_name = Path(original_filename).stem

    # Add document type to filename if detected
    doc_type_tag = ""
    if 'detected_document_type' in result:
        doc_type = result['detected_document_type'].lower()
        doc_type_tag = f"_{doc_type.replace(' ', '_')}"
    elif 'topics' in result and result['topics']:
        # Use first tag as document type if not explicitly detected
        doc_type_tag = f"_{result['topics'][0].lower().replace(' ', '_')}"

    # Add period tag for historical context if available
    period_tag = ""
    if 'topics' in result and result['topics']:
        for tag in result['topics']:
            if "century" in tag.lower() or "pre-" in tag.lower() or "era" in tag.lower():
                period_tag = f"_{tag.lower().replace(' ', '_')}"
                break

    # Generate final descriptive filename
    descriptive_name = f"{original_name}{doc_type_tag}{period_tag}{file_ext}"
    return descriptive_name

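A worked example for the function above (hypothetical inputs): a scan detected as a letter, with a period tag among its topics.

# Illustrative only.
name = create_descriptive_filename(
    "scan_042.jpg",
    {"detected_document_type": "Letter", "topics": ["19th Century"]},
    ".json",
)
# name == "scan_042_letter_19th_century.json"
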
def extract_subject_tags(result, raw_text, preprocessing_options=None):
    """
    Extract subject tags from OCR result

    Args:
        result: OCR result dictionary
        raw_text: Raw text from OCR
        preprocessing_options: Dictionary of preprocessing options

    Returns:
        list: Subject tags
    """
    subject_tags = []

    try:
        # Use existing topics as starting point if available
        if 'topics' in result and result['topics']:
            subject_tags = list(result['topics'])

        # Add document type if detected
        if 'detected_document_type' in result:
            doc_type = result['detected_document_type'].capitalize()
            if doc_type not in subject_tags:
                subject_tags.append(doc_type)

        # Analyze content for common themes based on keywords
        if raw_text:
            raw_text_lower = raw_text.lower()
            for theme, keywords in CONTENT_THEMES.items():
                if any(keyword in raw_text_lower for keyword in keywords):
                    if theme not in subject_tags:
                        subject_tags.append(theme)

        # Add document period tag if date patterns are detected
        if raw_text:
            # Look for years in content
            import re
            year_matches = re.findall(r'\b1[0-9]{3}\b|\b20[0-1][0-9]\b', raw_text)
            if year_matches:
                # Convert to integers
                years = [int(y) for y in year_matches]
                # Get earliest year
                earliest = min(years)

                # Find the period tag for this year
                for year_range, period_tag in PERIOD_TAGS.items():
                    if year_range[0] <= earliest <= year_range[1]:
                        if period_tag not in subject_tags:
                            subject_tags.append(period_tag)
                        break

        # Add languages as topics if available
        if 'languages' in result and result['languages']:
            for lang in result['languages']:
                if lang and lang not in subject_tags:
                    lang_tag = f"{lang} Language"
                    subject_tags.append(lang_tag)

        # Add preprocessing information as tags if preprocessing was applied
        if preprocessing_options:
            preprocessing_methods = []
            if preprocessing_options.get("document_type", "standard") != "standard":
                doc_type = preprocessing_options["document_type"].capitalize()
                preprocessing_tag = f"Enhanced ({doc_type})"
                if preprocessing_tag not in subject_tags:
                    subject_tags.append(preprocessing_tag)

            if preprocessing_options.get("grayscale", False):
                preprocessing_methods.append("Grayscale")
            if preprocessing_options.get("denoise", False):
                preprocessing_methods.append("Denoised")
            if preprocessing_options.get("contrast", 0) != 0:
                contrast_val = preprocessing_options.get("contrast", 0)
                if contrast_val > 0:
                    preprocessing_methods.append("Contrast Enhanced")
                else:
                    preprocessing_methods.append("Contrast Reduced")
            if preprocessing_options.get("rotation", 0) != 0:
                preprocessing_methods.append("Rotated")

            # Add a combined preprocessing tag if methods were applied
            if preprocessing_methods:
                prep_tag = "Preprocessed"
                if prep_tag not in subject_tags:
                    subject_tags.append(prep_tag)

                # Add the specific method as a tag if only one was used
                if len(preprocessing_methods) == 1:
                    method_tag = preprocessing_methods[0]
                    if method_tag not in subject_tags:
                        subject_tags.append(method_tag)

    except Exception as e:
        logger.warning(f"Error generating subject tags: {str(e)}")
        # Fallback tags if extraction fails
        if not subject_tags:
            subject_tags = DEFAULT_TAGS.copy()

    # Ensure we have at least 3 tags
    while len(subject_tags) < 3:
        for tag in DEFAULT_TAGS:
            if tag not in subject_tags:
                subject_tags.append(tag)
                break
        else:
            # If all default tags are already used, add generic ones
            for tag in GENERIC_TAGS:
                if tag not in subject_tags:
                    subject_tags.append(tag)
                    break
            else:
                # If we still can't add any more tags, break the loop
                break

    return subject_tags