Reconcile main with preview-improvements branch, implementing modular structure, raw text editing, and enhanced framing
- app.py +0 -0
- constants.py +110 -0
- error_handler.py +65 -0
- ocr_processing.py +279 -0
- preprocessing.py +180 -0
- ui/custom.css +222 -335
- ui/layout.py +210 -20
- ui_components.py +774 -0
- utils.py +263 -0
app.py
CHANGED
The diff for this file is too large to render.
constants.py
ADDED
@@ -0,0 +1,110 @@
"""
Constants for the Historical OCR application.

This module contains all the constants used throughout the application,
making it easier to maintain and update values in one place.
"""

# API limits
MAX_FILE_SIZE_MB = 50
MAX_PAGES = 20

# Caching
CACHE_TTL_SECONDS = 24 * 3600  # 24 hours
MAX_CACHE_ENTRIES = 20

# Image processing
MAX_IMAGE_DIMENSION = 2500
IMAGE_QUALITY = 92

# Document types
DOCUMENT_TYPES = [
    "Auto-detect (standard processing)",
    "Newspaper or Magazine",
    "Letter or Correspondence",
    "Book or Publication",
    "Form or Legal Document",
    "Recipe",
    "Handwritten Document",
    "Map or Illustration",
    "Table or Spreadsheet",
    "Other (specify in instructions)"
]

# Document layouts
DOCUMENT_LAYOUTS = [
    "Standard layout",
    "Multiple columns",
    "Table/grid format",
    "Mixed layout with images"
]

# Preprocessing document types
PREPROCESSING_DOC_TYPES = ["standard", "handwritten", "typed", "printed"]

# Rotation options
ROTATION_OPTIONS = [0, 90, 180, 270]

# PDF settings
DEFAULT_PDF_DPI = 100
MIN_PDF_DPI = 72
MAX_PDF_DPI = 300
DEFAULT_MAX_PAGES = 3

# Performance modes
PERFORMANCE_MODES = ["Quality", "Speed"]

# Custom prompt templates
CUSTOM_PROMPT_TEMPLATES = {
    "Newspaper or Magazine": "This is a newspaper/magazine. Process columns from top to bottom, capture headlines, bylines, article text and captions.",
    "Letter or Correspondence": "This is a letter/correspondence. Capture letterhead, date, greeting, body, closing and signature. Note any handwritten annotations.",
    "Book or Publication": "This is a book/publication. Extract titles, headers, footnotes, page numbers and body text. Preserve paragraph structure and any special formatting.",
    "Form or Legal Document": "This is a form/legal document. Extract all field labels and values, preserving the structure. Pay special attention to signature lines, dates, and any official markings.",
    "Recipe": "This is a recipe. Extract title, ingredients list with measurements, and preparation instructions. Maintain the distinction between ingredients and preparation steps.",
    "Handwritten Document": "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations.",
    "Map or Illustration": "This is a map or illustration. Transcribe all labels, legends, captions, and annotations. Note any scale indicators or directional markings.",
    "Table or Spreadsheet": "This is a table/spreadsheet. Preserve row and column structure, maintaining alignment of data. Extract headers and all cell values.",
    "Other (specify in instructions)": "Please describe the document type and any special processing requirements here."
}

# Layout prompt additions
LAYOUT_PROMPT_ADDITIONS = {
    "Multiple columns": "Document has multiple columns. Read each column from top to bottom, then move to the next column.",
    "Table/grid format": "Document contains table data. Preserve row and column structure during extraction.",
    "Mixed layout with images": "Document has mixed text layout with images. Extract text in proper reading order."
}

# Content themes for subject tag extraction
CONTENT_THEMES = {
    "Historical": ["century", "ancient", "historical", "history", "vintage", "archive", "heritage"],
    "Travel": ["travel", "journey", "expedition", "exploration", "voyage", "map", "location"],
    "Science": ["experiment", "research", "study", "analysis", "scientific", "laboratory"],
    "Literature": ["book", "novel", "poetry", "author", "literary", "chapter", "story"],
    "Art": ["painting", "illustration", "drawing", "artist", "exhibit", "gallery", "portrait"],
    "Education": ["education", "school", "university", "college", "learning", "student", "teach"],
    "Politics": ["government", "political", "policy", "administration", "election", "legislature"],
    "Business": ["business", "company", "corporation", "market", "industry", "commercial", "trade"],
    "Social": ["society", "community", "social", "culture", "tradition", "customs"],
    "Technology": ["technology", "invention", "device", "mechanical", "machine", "technical"],
    "Military": ["military", "army", "navy", "war", "battle", "soldier", "weapon"],
    "Religion": ["religion", "church", "temple", "spiritual", "sacred", "ritual"],
    "Medicine": ["medical", "medicine", "health", "hospital", "treatment", "disease", "doctor"],
    "Legal": ["legal", "law", "court", "justice", "attorney", "judicial", "statute"],
    "Correspondence": ["letter", "mail", "correspondence", "message", "communication"]
}

# Period tags based on year ranges
PERIOD_TAGS = {
    (0, 1799): "Pre-1800s",
    (1800, 1849): "Early 19th Century",
    (1850, 1899): "Late 19th Century",
    (1900, 1949): "Early 20th Century",
    (1950, 2099): "Modern Era"
}

# Default fallback tags
DEFAULT_TAGS = ["Document", "Historical", "Text"]
GENERIC_TAGS = ["Archive", "Content", "Record"]

# UI constants
PROGRESS_DELAY = 0.8  # Seconds to show completion message
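As a usage sketch (not part of this commit), the document-type and layout constants are presumably combined into a single OCR instruction before being passed as a custom prompt; build_custom_prompt below is a hypothetical helper, not a function from the repository.

# Hypothetical helper showing how CUSTOM_PROMPT_TEMPLATES and
# LAYOUT_PROMPT_ADDITIONS could be merged into one OCR instruction string.
from constants import CUSTOM_PROMPT_TEMPLATES, LAYOUT_PROMPT_ADDITIONS

def build_custom_prompt(doc_type: str, layout: str, extra_instructions: str = "") -> str:
    parts = []
    if doc_type in CUSTOM_PROMPT_TEMPLATES:
        parts.append(CUSTOM_PROMPT_TEMPLATES[doc_type])
    if layout in LAYOUT_PROMPT_ADDITIONS:
        parts.append(LAYOUT_PROMPT_ADDITIONS[layout])
    if extra_instructions:
        parts.append(extra_instructions)
    return " ".join(parts)

# e.g. a multi-column newspaper page:
print(build_custom_prompt("Newspaper or Magazine", "Multiple columns"))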
error_handler.py
ADDED
@@ -0,0 +1,65 @@
import logging
import streamlit as st
import time
from constants import MAX_FILE_SIZE_MB

# Configure logging
logger = logging.getLogger("error_handler")
logger.setLevel(logging.INFO)

def handle_ocr_error(exception, progress_reporter=None):
    """
    Handle OCR processing errors and provide user-friendly messages

    Args:
        exception: The exception that occurred
        progress_reporter: ProgressReporter instance for UI updates

    Returns:
        str: User-friendly error message
    """
    error_message = str(exception)

    # Complete progress reporting if provided
    if progress_reporter:
        progress_reporter.complete(success=False)

    # Check for specific error types and provide helpful user-facing messages
    if "rate limit" in error_message.lower() or "429" in error_message or "requests rate limit exceeded" in error_message.lower():
        friendly_message = "The AI service is currently experiencing high demand. Please try again in a few minutes."
        logger.error(f"Rate limit error: {error_message}")
        return friendly_message
    elif "quota" in error_message.lower() or "credit" in error_message.lower() or "subscription" in error_message.lower():
        friendly_message = "The API usage quota has been reached. Please check your API key and subscription limits."
        logger.error(f"API quota error: {error_message}")
        return friendly_message
    elif "timeout" in error_message.lower() or "timed out" in error_message.lower():
        friendly_message = "The request timed out. This may be due to a large document or high server load. Please try again or use a smaller document."
        logger.error(f"Timeout error: {error_message}")
        return friendly_message
    elif "file size" in error_message.lower() or "too large" in error_message.lower():
        friendly_message = f"The file is too large. Maximum file size is {MAX_FILE_SIZE_MB}MB."
        logger.error(f"File size error: {error_message}")
        return friendly_message
    else:
        # Generic error message for other errors
        logger.error(f"OCR processing error: {error_message}", exc_info=True)
        return f"An error occurred during processing: {error_message}"

def check_file_size(file_bytes):
    """
    Check if file size is within limits

    Args:
        file_bytes: File content as bytes

    Returns:
        tuple: (is_valid, file_size_mb, error_message)
    """
    file_size_mb = len(file_bytes) / (1024 * 1024)

    if file_size_mb > MAX_FILE_SIZE_MB:
        error_message = f"File size {file_size_mb:.2f} MB exceeds limit of {MAX_FILE_SIZE_MB} MB"
        return False, file_size_mb, error_message

    return True, file_size_mb, None
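For illustration only, a minimal sketch of how the two helpers fit together; validate_upload is a hypothetical wrapper, not part of the commit.

# Sketch of intended usage: size check first, error mapping on failure.
from error_handler import check_file_size, handle_ocr_error

def validate_upload(file_bytes: bytes) -> float:
    is_valid, size_mb, error_message = check_file_size(file_bytes)
    if not is_valid:
        raise ValueError(error_message)  # surfaced to the user by the caller
    return size_mb

try:
    validate_upload(b"\x00" * (60 * 1024 * 1024))  # 60 MB payload exceeds the 50 MB limit
except Exception as exc:
    # handle_ocr_error maps raw exceptions to user-friendly messages
    print(handle_ocr_error(exc))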
ocr_processing.py
ADDED
@@ -0,0 +1,279 @@
import os
import hashlib
import tempfile
import streamlit as st
import logging
import time
from datetime import datetime
from pathlib import Path
from structured_ocr import StructuredOCR
from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
from preprocessing import apply_preprocessing_to_file
from error_handler import handle_ocr_error, check_file_size

# Configure logging
logger = logging.getLogger("ocr_processing")
logger.setLevel(logging.INFO)

@st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None):
    """
    Cached version of OCR processing to reuse results

    Args:
        file_path: Path to the file to process
        file_type: Type of file (pdf or image)
        use_vision: Whether to use vision model
        file_size_mb: File size in MB
        cache_key: Cache key for the file
        preprocessing_options_hash: Hash of preprocessing options

    Returns:
        dict: OCR result
    """
    # Initialize OCR processor
    processor = StructuredOCR()

    # Process the file
    with timing(f"OCR processing of {file_type} file"):
        result = processor.process_file(
            file_path,
            file_type=file_type,
            use_vision=use_vision,
            file_size_mb=file_size_mb
        )

    return result

def process_file(uploaded_file, use_vision=True, preprocessing_options=None, progress_reporter=None,
                 pdf_dpi=150, max_pages=3, pdf_rotation=0, custom_prompt=None, perf_mode="Quality"):
    """
    Process the uploaded file and return the OCR results

    Args:
        uploaded_file: The uploaded file to process
        use_vision: Whether to use vision model
        preprocessing_options: Dictionary of preprocessing options
        progress_reporter: ProgressReporter instance for UI updates
        pdf_dpi: DPI for PDF conversion
        max_pages: Maximum number of pages to process
        pdf_rotation: PDF rotation value
        custom_prompt: Custom prompt for OCR
        perf_mode: Performance mode (Quality or Speed)

    Returns:
        dict: OCR result
    """
    if preprocessing_options is None:
        preprocessing_options = {}

    # Create a container for progress indicators if not provided
    if progress_reporter is None:
        from ui_components import ProgressReporter
        progress_reporter = ProgressReporter(st.empty()).setup()

    # Initialize temporary file paths list
    temp_file_paths = []

    try:
        # Check if file size exceeds maximum allowed size
        is_valid, file_size_mb, error_message = check_file_size(uploaded_file.getvalue())
        if not is_valid:
            progress_reporter.complete(success=False)
            st.error(error_message)
            return {
                "file_name": uploaded_file.name,
                "topics": ["Document"],
                "languages": ["English"],
                "error": error_message,
                "ocr_contents": {
                    "error": error_message,
                    "partial_text": "Document could not be processed due to size limitations."
                }
            }

        # Update progress
        progress_reporter.update(10, "Initializing OCR processor...")

        # Determine file type from extension
        file_ext = Path(uploaded_file.name).suffix.lower()
        file_type = "pdf" if file_ext == ".pdf" else "image"
        file_bytes = uploaded_file.getvalue()

        # For PDFs, we need to handle differently
        if file_type == "pdf":
            progress_reporter.update(20, "Converting PDF to images...")

            # Process PDF with direct handling
            progress_reporter.update(30, "Processing PDF with OCR...")

            # Create a temporary file for processing
            temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=file_ext).name
            with open(temp_path, 'wb') as f:
                f.write(file_bytes)
            temp_file_paths.append(temp_path)

            # Generate cache key
            cache_key = generate_cache_key(
                file_bytes,
                file_type,
                use_vision,
                preprocessing_options,
                pdf_rotation,
                custom_prompt
            )

            # Process with cached function if possible
            try:
                result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options))
                progress_reporter.update(90, "Finalizing results...")
            except Exception as e:
                logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
                progress_reporter.update(60, f"Processing error: {str(e)}. Retrying...")

                # If caching fails, process directly
                processor = StructuredOCR()

                # Apply performance mode settings
                if perf_mode == "Speed":
                    # Override settings for faster processing
                    if pdf_dpi > 100:
                        pdf_dpi = 100  # Lower DPI for speed

                # Process directly with optimized settings
                result = processor.process_file(
                    file_path=temp_path,
                    file_type="pdf",
                    use_vision=use_vision,
                    custom_prompt=custom_prompt,
                    file_size_mb=file_size_mb,
                    pdf_rotation=pdf_rotation
                )

                progress_reporter.update(90, "Finalizing results...")
        else:
            # For image files
            progress_reporter.update(20, "Preparing image for processing...")

            # Apply preprocessing if needed
            temp_path, preprocessing_applied = apply_preprocessing_to_file(
                file_bytes,
                file_ext,
                preprocessing_options,
                temp_file_paths
            )

            if preprocessing_applied:
                progress_reporter.update(30, "Applied image preprocessing...")

            # Generate cache key
            cache_key = generate_cache_key(
                open(temp_path, 'rb').read(),
                file_type,
                use_vision,
                preprocessing_options,
                0,  # No rotation for images (handled in preprocessing)
                custom_prompt
            )

            # Process the file using cached function if possible
            progress_reporter.update(50, "Processing document with OCR...")
            try:
                result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options))
                progress_reporter.update(80, "Analyzing document structure...")
                progress_reporter.update(90, "Finalizing results...")
            except Exception as e:
                logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
                progress_reporter.update(60, f"Processing error: {str(e)}. Retrying...")

                # If caching fails, process directly
                processor = StructuredOCR()

                # Apply performance mode settings
                if perf_mode == "Speed":
                    # Use simpler processing for speed
                    pass  # Any speed optimizations would be handled by the StructuredOCR class

                result = processor.process_file(
                    file_path=temp_path,
                    file_type=file_type,
                    use_vision=use_vision,
                    custom_prompt=custom_prompt,
                    file_size_mb=file_size_mb
                )

                progress_reporter.update(90, "Finalizing results...")

        # Add additional metadata to result
        result = process_result(result, uploaded_file, preprocessing_options)

        # Complete progress
        progress_reporter.complete()

        return result
    except Exception as e:
        # Handle errors
        error_message = handle_ocr_error(e, progress_reporter)

        # Return error result
        return {
            "file_name": uploaded_file.name,
            "topics": ["Document"],
            "languages": ["English"],
            "error": error_message,
            "ocr_contents": {
                "error": f"Failed to process file: {error_message}",
                "partial_text": "Document could not be processed due to an error."
            }
        }
    finally:
        # Clean up temporary files
        for temp_path in temp_file_paths:
            try:
                if os.path.exists(temp_path):
                    os.unlink(temp_path)
                    logger.info(f"Removed temporary file: {temp_path}")
            except Exception as e:
                logger.warning(f"Failed to remove temporary file {temp_path}: {str(e)}")

def process_result(result, uploaded_file, preprocessing_options=None):
    """
    Process OCR result to add metadata, tags, etc.

    Args:
        result: OCR result dictionary
        uploaded_file: The uploaded file
        preprocessing_options: Dictionary of preprocessing options

    Returns:
        dict: Processed OCR result
    """
    # Add timestamp
    result['timestamp'] = format_timestamp()

    # Add processing time if not already present
    if 'processing_time' not in result:
        result['processing_time'] = 0.0

    # Generate descriptive filename
    file_ext = Path(uploaded_file.name).suffix.lower()
    result['descriptive_file_name'] = create_descriptive_filename(
        uploaded_file.name,
        result,
        file_ext,
        preprocessing_options
    )

    # Extract raw text from OCR contents
    raw_text = ""
    if 'ocr_contents' in result:
        if 'raw_text' in result['ocr_contents']:
            raw_text = result['ocr_contents']['raw_text']
        elif 'content' in result['ocr_contents']:
            raw_text = result['ocr_contents']['content']

    # Extract subject tags if not already present or enhance existing ones
    if 'topics' not in result or not result['topics']:
        result['topics'] = extract_subject_tags(result, raw_text, preprocessing_options)

    return result
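generate_cache_key is imported from utils.py, whose diff is not rendered above. As a sketch of the caching idea only (the real implementation may differ), a key would need to hash the file bytes together with every option that changes the OCR output, so that st.cache_data never returns stale results:

# Sketch only: the actual generate_cache_key lives in utils.py and may differ.
import hashlib

def generate_cache_key(file_bytes, file_type, use_vision, preprocessing_options,
                       pdf_rotation, custom_prompt):
    hasher = hashlib.sha256()
    hasher.update(file_bytes)
    # Every argument that affects the result must feed the key.
    hasher.update(repr((file_type, use_vision, sorted(preprocessing_options.items()),
                        pdf_rotation, custom_prompt)).encode("utf-8"))
    return hasher.hexdigest()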
preprocessing.py
ADDED
@@ -0,0 +1,180 @@
import os
import io
import cv2
import numpy as np
import tempfile
from PIL import Image, ImageEnhance, ImageFilter
from pdf2image import convert_from_bytes
import streamlit as st
import logging

# Configure logging
logger = logging.getLogger("preprocessing")
logger.setLevel(logging.INFO)

@st.cache_data(ttl=24*3600, show_spinner=False)  # Cache for 24 hours
def convert_pdf_to_images(pdf_bytes, dpi=150, rotation=0):
    """Convert PDF bytes to a list of images with caching"""
    try:
        images = convert_from_bytes(pdf_bytes, dpi=dpi)

        # Apply rotation if specified
        if rotation != 0 and images:
            rotated_images = []
            for img in images:
                rotated_img = img.rotate(rotation, expand=True, resample=Image.BICUBIC)
                rotated_images.append(rotated_img)
            return rotated_images

        return images
    except Exception as e:
        st.error(f"Error converting PDF: {str(e)}")
        logger.error(f"PDF conversion error: {str(e)}")
        return []

@st.cache_data(ttl=24*3600, show_spinner=False, hash_funcs={dict: lambda x: str(sorted(x.items()))})
def preprocess_image(image_bytes, preprocessing_options):
    """Preprocess image with selected options optimized for historical document OCR quality"""
    # Setup basic console logging
    logger = logging.getLogger("image_preprocessor")
    logger.setLevel(logging.INFO)

    # Log which preprocessing options are being applied
    logger.info(f"Preprocessing image with options: {preprocessing_options}")

    # Convert bytes to PIL Image
    image = Image.open(io.BytesIO(image_bytes))

    # Check for alpha channel (RGBA) and convert to RGB if needed
    if image.mode == 'RGBA':
        # Convert RGBA to RGB by compositing the image onto a white background
        background = Image.new('RGB', image.size, (255, 255, 255))
        background.paste(image, mask=image.split()[3])  # 3 is the alpha channel
        image = background
        logger.info("Converted RGBA image to RGB")
    elif image.mode not in ('RGB', 'L'):
        # Convert other modes to RGB as well
        image = image.convert('RGB')
        logger.info(f"Converted {image.mode} image to RGB")

    # Apply rotation if specified
    if preprocessing_options.get("rotation", 0) != 0:
        rotation_degrees = preprocessing_options.get("rotation")
        image = image.rotate(rotation_degrees, expand=True, resample=Image.BICUBIC)

    # Resize large images while preserving details important for OCR
    width, height = image.size
    max_dimension = max(width, height)

    # Less aggressive resizing to preserve document details
    if max_dimension > 2500:
        scale_factor = 2500 / max_dimension
        new_width = int(width * scale_factor)
        new_height = int(height * scale_factor)
        # Use LANCZOS for better quality preservation
        image = image.resize((new_width, new_height), Image.LANCZOS)

    img_array = np.array(image)

    # Apply preprocessing based on selected options with settings optimized for historical documents
    document_type = preprocessing_options.get("document_type", "standard")

    # Process grayscale option first as it's a common foundation
    if preprocessing_options.get("grayscale", False):
        if len(img_array.shape) == 3:  # Only convert if it's not already grayscale
            if document_type == "handwritten":
                # Enhanced grayscale processing for handwritten documents
                img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
                # Apply adaptive histogram equalization to enhance handwriting
                clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
                img_array = clahe.apply(img_array)
            else:
                # Standard grayscale for printed documents
                img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)

            # Convert back to RGB for further processing
            img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)

    if preprocessing_options.get("contrast", 0) != 0:
        contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 100)
        image = Image.fromarray(img_array)
        enhancer = ImageEnhance.Contrast(image)
        image = enhancer.enhance(contrast_factor)
        img_array = np.array(image)

    if preprocessing_options.get("denoise", False):
        try:
            # Apply appropriate denoising based on document type
            if document_type == "handwritten":
                # Very light denoising for handwritten documents to preserve pen strokes
                if len(img_array.shape) == 3 and img_array.shape[2] == 3:  # Color image
                    img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 3, 3, 5, 9)
                else:  # Grayscale image
                    img_array = cv2.fastNlMeansDenoising(img_array, None, 3, 7, 21)
            else:
                # Standard denoising for printed documents
                if len(img_array.shape) == 3 and img_array.shape[2] == 3:  # Color image
                    img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 5, 5, 7, 21)
                else:  # Grayscale image
                    img_array = cv2.fastNlMeansDenoising(img_array, None, 5, 7, 21)
        except Exception as e:
            logger.error(f"Denoising error: {str(e)}, falling back to standard processing")

    # Convert back to PIL Image
    processed_image = Image.fromarray(img_array)

    # Higher quality for OCR processing
    byte_io = io.BytesIO()
    try:
        # Make sure the image is in RGB mode before saving as JPEG
        if processed_image.mode not in ('RGB', 'L'):
            processed_image = processed_image.convert('RGB')

        processed_image.save(byte_io, format='JPEG', quality=92, optimize=True)
        byte_io.seek(0)

        logger.info(f"Preprocessing complete. Original image mode: {image.mode}, processed mode: {processed_image.mode}")
        logger.info(f"Original size: {len(image_bytes)/1024:.1f}KB, processed size: {len(byte_io.getvalue())/1024:.1f}KB")

        return byte_io.getvalue()
    except Exception as e:
        logger.error(f"Error saving processed image: {str(e)}")
        # Fallback to original image
        logger.info("Using original image as fallback")
        image_io = io.BytesIO()
        image.save(image_io, format='JPEG', quality=92)
        image_io.seek(0)
        return image_io.getvalue()

def create_temp_file(content, suffix, temp_file_paths):
    """Create a temporary file and track it for cleanup"""
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(content)
        temp_path = tmp.name
    # Track temporary file for cleanup
    temp_file_paths.append(temp_path)
    logger.info(f"Created temporary file: {temp_path}")
    return temp_path

def apply_preprocessing_to_file(file_bytes, file_ext, preprocessing_options, temp_file_paths):
    """Apply preprocessing to file and return path to processed file"""
    # Check if any preprocessing options with boolean values are True, or if any non-boolean values are non-default
    has_preprocessing = (
        preprocessing_options.get("grayscale", False) or
        preprocessing_options.get("denoise", False) or
        preprocessing_options.get("contrast", 0) != 0 or
        preprocessing_options.get("rotation", 0) != 0 or
        preprocessing_options.get("document_type", "standard") != "standard"
    )

    if has_preprocessing:
        # Apply preprocessing
        processed_bytes = preprocess_image(file_bytes, preprocessing_options)

        # Save processed image to temp file
        temp_path = create_temp_file(processed_bytes, file_ext, temp_file_paths)
        return temp_path, True  # Return path and flag indicating preprocessing was applied
    else:
        # No preprocessing needed, just save the original file
        temp_path = create_temp_file(file_bytes, file_ext, temp_file_paths)
        return temp_path, False  # Return path and flag indicating no preprocessing was applied
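A minimal usage sketch, with the options-dict keys inferred from the .get() lookups above; the filename is illustrative only.

# Illustrative call; keys mirror the .get() lookups in preprocess_image.
from preprocessing import apply_preprocessing_to_file

temp_file_paths = []
options = {
    "document_type": "handwritten",  # enables CLAHE and the lighter denoising path
    "grayscale": True,
    "denoise": True,
    "contrast": 10,   # interpreted as +10% contrast
    "rotation": 0,
}
with open("letter_scan.jpg", "rb") as f:  # hypothetical input file
    temp_path, applied = apply_preprocessing_to_file(f.read(), ".jpg", options, temp_file_paths)
print(temp_path, applied)  # path to the preprocessed JPEG, True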
ui/custom.css
CHANGED
@@ -1,395 +1,282 @@
The stylesheet is largely rewritten (395 lines replaced by 282). The new ui/custom.css:

/* Custom CSS for Historical OCR Application */

/* Global styles */
body {
    font-family: 'Source Sans Pro', sans-serif;
    color: #333;
}

/* Header styles */
h1, h2, h3, h4, h5, h6 {
    font-family: 'Georgia', serif;
    font-weight: 600;
    color: #1E3A8A;
}

/* Raw text editor styling */
.raw-text-editor {
    font-family: 'Courier New', monospace;
    font-size: 14px;
    line-height: 1.5;
    border: 1px solid #ddd;
    border-radius: 4px;
    padding: 10px;
    background-color: #f9f9f9;
}

/* Document content styling */
.document-content {
    margin-top: 20px;
}

.document-section {
    margin-bottom: 20px;
    padding: 15px;
    background-color: #fff;
    border-radius: 8px;
    border: 1px solid #e0e0e0;
}

.document-section h4 {
    margin-top: 0;
    margin-bottom: 10px;
    color: #1E3A8A;
}

/* Subject tag styling */
.subject-tag {
    display: inline-block;
    padding: 3px 8px;
    border-radius: 12px;
    font-size: 0.85em;
    margin-right: 5px;
    margin-bottom: 5px;
    color: white;
}

.tag-time-period {
    background-color: #1565c0;
}

.tag-language {
    background-color: #00695c;
}

.tag-document-type {
    background-color: #6a1b9a;
}

.tag-subject {
    background-color: #2e7d32;
}

.tag-preprocessing {
    background-color: #e65100;
}

.tag-default {
    background-color: #546e7a;
}

/* Image and text side-by-side styling */
.image-text-container {
    display: flex;
    gap: 20px;
    margin-bottom: 20px;
}

.image-container {
    flex: 1;
}

.text-container {
    flex: 1;
}

/* Sidebar styling */
.sidebar-section {
    margin-bottom: 20px;
}

.sidebar-section h3 {
    margin-top: 0;
    margin-bottom: 10px;
    font-size: 16px;
}

/* Button styling */
.primary-button {
    background-color: #1E88E5;
    color: white;
    border: none;
    border-radius: 4px;
    padding: 8px 16px;
    font-weight: 600;
    cursor: pointer;
    transition: background-color 0.2s;
}

.primary-button:hover {
    background-color: #1565C0;
}

.secondary-button {
    background-color: #f8f9fa;
    color: #333;
    border: 1px solid #ddd;
    border-radius: 4px;
    padding: 8px 16px;
    font-weight: 600;
    cursor: pointer;
    transition: background-color 0.2s;
}

.secondary-button:hover {
    background-color: #e9ecef;
}

/* Processing status styling */
.processing-status {
    padding: 10px 15px;
    border-left: 4px solid #1E88E5;
    background-color: #E3F2FD;
    border-radius: 0 4px 4px 0;
    margin: 10px 0;
    font-size: 14px;
}

/* Previous results styling */
.previous-results-container {
    margin-top: 20px;
}

.result-card {
    background-color: #f8f9fa;
    border-radius: 8px;
    padding: 15px;
    margin-bottom: 15px;
    border: 1px solid #e0e0e0;
    transition: all 0.2s ease;
}

.result-card:hover {
    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
    border-color: #c0c0c0;
}

.result-header {
    display: flex;
    justify-content: space-between;
    margin-bottom: 10px;
}

.result-filename {
    font-weight: bold;
    font-size: 16px;
}

.result-date {
    color: #666;
    font-size: 14px;
}

.result-metadata {
    margin-top: 10px;
    font-size: 14px;
}

.result-tag {
    margin-bottom: 5px;
    color: #555;
}

.result-action-button {
    margin-top: 10px;
    text-align: right;
}

.selected-result-container {
    margin-top: 30px;
    padding: 20px;
    background-color: #f0f2f6;
    border-radius: 8px;
    border: 1px solid #d0d7de;
}

.selected-result-title {
    font-size: 18px;
    font-weight: bold;
    color: #1E3A8A;
}

/* About tab styling */
.about-section {
    margin-bottom: 30px;
}

.about-section h3 {
    color: #1E3A8A;
    margin-bottom: 10px;
}

.feature-list {
    list-style-type: none;
    padding-left: 0;
}

.feature-list li {
    margin-bottom: 8px;
    padding-left: 20px;
    position: relative;
}

.feature-list li:before {
    content: "•";
    position: absolute;
    left: 0;
    color: #1E88E5;
}

/* File uploader styling */
.file-uploader {
    border: 2px dashed #ddd;
    border-radius: 8px;
    padding: 20px;
    text-align: center;
    transition: border-color 0.2s;
}

.file-uploader:hover {
    border-color: #1E88E5;
}

/* Example documents styling */
.example-documents {
    margin-top: 20px;
}

.example-card {
    background-color: #f8f9fa;
    border-radius: 8px;
    padding: 15px;
    margin-bottom: 15px;
    border: 1px solid #e0e0e0;
    cursor: pointer;
    transition: all 0.2s ease;
}

.example-card:hover {
    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
    border-color: #c0c0c0;
}

.example-title {
    font-weight: bold;
    font-size: 16px;
    margin-bottom: 5px;
}

.example-description {
    font-size: 14px;
    color: #555;
}
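The subject-tag classes are presumably applied from Python by emitting HTML through st.markdown; the snippet below is a hypothetical illustration, not code from this commit.

# Hypothetical rendering of the .subject-tag / .tag-* classes defined in ui/custom.css.
import streamlit as st

def render_tag(label: str, category: str = "default") -> None:
    # category maps onto the .tag-* modifier classes (subject, language, time-period, ...)
    st.markdown(
        f'<span class="subject-tag tag-{category}">{label}</span>',
        unsafe_allow_html=True,
    )

render_tag("Early 19th Century", "time-period")
render_tag("Correspondence", "subject")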
ui/layout.py
CHANGED
@@ -1,27 +1,217 @@
The previous 27-line module is replaced by a version of load_css that injects its CSS inline via st.markdown; the rendered diff is truncated after line 132 of the new file:

import streamlit as st

def load_css():
    """Load custom CSS for the application"""
    st.markdown("""
    <style>
    /* Global styles */
    body {
        font-family: 'Source Sans Pro', sans-serif;
        color: #333;
    }

    /* Header styles */
    h1, h2, h3, h4, h5, h6 {
        font-family: 'Georgia', serif;
        font-weight: 600;
        color: #1E3A8A;
    }

    /* Processing status container */
    .processing-status-container {
        padding: 10px 15px;
        border-left: 4px solid #1E88E5;
        background-color: #E3F2FD;
        border-radius: 0 4px 4px 0;
        margin: 10px 0;
        font-size: 14px;
    }

    /* Previous results styling */
    .previous-results-container {
        margin-top: 20px;
    }

    .result-card {
        background-color: #f8f9fa;
        border-radius: 8px;
        padding: 15px;
        margin-bottom: 15px;
        border: 1px solid #e0e0e0;
        transition: all 0.2s ease;
    }

    .result-card:hover {
        box-shadow: 0 4px 8px rgba(0,0,0,0.1);
        border-color: #c0c0c0;
    }

    .result-header {
        display: flex;
        justify-content: space-between;
        margin-bottom: 10px;
    }

    .result-filename {
        font-weight: bold;
        font-size: 16px;
    }

    .result-date {
        color: #666;
        font-size: 14px;
    }

    .result-metadata {
        margin-top: 10px;
        font-size: 14px;
    }

    .result-tag {
        margin-bottom: 5px;
        color: #555;
    }

    .result-action-button {
        margin-top: 10px;
        text-align: right;
    }

    .selected-result-container {
        margin-top: 30px;
        padding: 20px;
        background-color: #f0f2f6;
        border-radius: 8px;
        border: 1px solid #d0d7de;
    }

    .selected-result-title {
        font-size: 18px;
        font-weight: bold;
        color: #1E3A8A;
    }

    /* Raw text editor styling */
    .stTextArea textarea {
        font-family: 'Courier New', monospace;
        font-size: 14px;
        line-height: 1.5;
    }

    /* Image and text side-by-side styling */
    .image-text-container {
        display: flex;
        gap: 20px;
        margin-bottom: 20px;
    }

    .image-container {
        flex: 1;
    }

    .text-container {
        flex: 1;
    }

    /* Sidebar styling */
    .sidebar .stRadio > div {
        flex-direction: row;
    }

    .sidebar .stRadio label {
        margin-right: 10px;
    }

    /* Optimize spacing in sidebar */
    .sidebar .block-container {
        padding-top: 0;
    }

    .sidebar [data-testid="stVerticalBlock"] {
        gap: 0;
+
}
|
| 133 |
+
|
| 134 |
+
/* Button styling */
|
| 135 |
+
.stButton > button {
|
| 136 |
+
border-radius: 4px;
|
| 137 |
+
font-weight: 600;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
/* File uploader styling */
|
| 141 |
+
.stFileUploader > section > div {
|
| 142 |
+
min-height: 100px;
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
/* Reset vertical text in file uploader */
|
| 146 |
+
.stFileUploader p,
|
| 147 |
+
.stFileUploader span,
|
| 148 |
+
.stFileUploader div p,
|
| 149 |
+
.stFileUploader div span,
|
| 150 |
+
.stFileUploader label p,
|
| 151 |
+
.stFileUploader label span,
|
| 152 |
+
.stFileUploader div[data-testid="stFileUploadDropzone"] p,
|
| 153 |
+
.stFileUploader div[data-testid="stFileUploadDropzone"] span {
|
| 154 |
+
writing-mode: horizontal-tb !important;
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
/* Metadata styling */
|
| 158 |
+
.metadata-card {
|
| 159 |
+
background-color: #f8f9fa;
|
| 160 |
+
border-radius: 8px;
|
| 161 |
+
padding: 15px;
|
| 162 |
+
margin-bottom: 20px;
|
| 163 |
+
border: 1px solid #e0e0e0;
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
/* Document content styling */
|
| 167 |
+
.document-content {
|
| 168 |
+
margin-top: 10px;
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
/* Tab styling */
|
| 172 |
+
.stTabs [data-baseweb="tab-list"] {
|
| 173 |
+
gap: 8px;
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
.stTabs [data-baseweb="tab"] {
|
| 177 |
+
padding: 8px 16px;
|
| 178 |
+
border-radius: 4px 4px 0 0;
|
| 179 |
+
}
|
| 180 |
|
| 181 |
+
/* Success message styling */
|
| 182 |
+
.stSuccess {
|
| 183 |
+
background-color: #D4EDDA;
|
| 184 |
+
color: #155724;
|
| 185 |
+
padding: 10px;
|
| 186 |
+
border-radius: 4px;
|
| 187 |
+
border-left: 5px solid #155724;
|
| 188 |
+
}
|
| 189 |
|
| 190 |
+
/* Error message styling */
|
| 191 |
+
.stError {
|
| 192 |
+
background-color: #F8D7DA;
|
| 193 |
+
color: #721C24;
|
| 194 |
+
padding: 10px;
|
| 195 |
+
border-radius: 4px;
|
| 196 |
+
border-left: 5px solid #721C24;
|
| 197 |
+
}
|
| 198 |
|
| 199 |
+
/* Info message styling */
|
| 200 |
+
.stInfo {
|
| 201 |
+
background-color: #D1ECF1;
|
| 202 |
+
color: #0C5460;
|
| 203 |
+
padding: 10px;
|
| 204 |
+
border-radius: 4px;
|
| 205 |
+
border-left: 5px solid #0C5460;
|
| 206 |
+
}
|
| 207 |
|
| 208 |
+
/* Warning message styling */
|
| 209 |
+
.stWarning {
|
| 210 |
+
background-color: #FFF3CD;
|
| 211 |
+
color: #856404;
|
| 212 |
+
padding: 10px;
|
| 213 |
+
border-radius: 4px;
|
| 214 |
+
border-left: 5px solid #856404;
|
| 215 |
+
}
|
| 216 |
+
</style>
|
| 217 |
+
""", unsafe_allow_html=True)
|
ui_components.py
ADDED
|
@@ -0,0 +1,774 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
import io
|
| 4 |
+
import base64
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import json
|
| 8 |
+
from constants import (
|
| 9 |
+
DOCUMENT_TYPES,
|
| 10 |
+
DOCUMENT_LAYOUTS,
|
| 11 |
+
CUSTOM_PROMPT_TEMPLATES,
|
| 12 |
+
LAYOUT_PROMPT_ADDITIONS,
|
| 13 |
+
DEFAULT_PDF_DPI,
|
| 14 |
+
MIN_PDF_DPI,
|
| 15 |
+
MAX_PDF_DPI,
|
| 16 |
+
DEFAULT_MAX_PAGES,
|
| 17 |
+
PERFORMANCE_MODES,
|
| 18 |
+
PREPROCESSING_DOC_TYPES,
|
| 19 |
+
ROTATION_OPTIONS
|
| 20 |
+
)
|
| 21 |
+
from utils import get_base64_from_image, extract_subject_tags
|
| 22 |
+
|
| 23 |
+
class ProgressReporter:
|
| 24 |
+
"""Class to handle progress reporting in the UI"""
|
| 25 |
+
|
| 26 |
+
def __init__(self, placeholder):
|
| 27 |
+
self.placeholder = placeholder
|
| 28 |
+
self.progress_bar = None
|
| 29 |
+
self.status_text = None
|
| 30 |
+
|
| 31 |
+
def setup(self):
|
| 32 |
+
"""Setup the progress components"""
|
| 33 |
+
with self.placeholder.container():
|
| 34 |
+
self.progress_bar = st.progress(0)
|
| 35 |
+
self.status_text = st.empty()
|
| 36 |
+
return self
|
| 37 |
+
|
| 38 |
+
def update(self, percent, status_text):
|
| 39 |
+
"""Update the progress bar and status text"""
|
| 40 |
+
if self.progress_bar is not None:
|
| 41 |
+
self.progress_bar.progress(percent / 100)
|
| 42 |
+
if self.status_text is not None:
|
| 43 |
+
self.status_text.text(status_text)
|
| 44 |
+
|
| 45 |
+
def complete(self, success=True):
|
| 46 |
+
"""Complete the progress reporting"""
|
| 47 |
+
if success:
|
| 48 |
+
if self.progress_bar is not None:
|
| 49 |
+
self.progress_bar.progress(100)
|
| 50 |
+
if self.status_text is not None:
|
| 51 |
+
self.status_text.text("Processing complete!")
|
| 52 |
+
else:
|
| 53 |
+
if self.status_text is not None:
|
| 54 |
+
self.status_text.text("Processing failed.")
|
| 55 |
+
|
| 56 |
+
# Clear the progress components after a delay
|
| 57 |
+
import time
|
| 58 |
+
time.sleep(0.8) # Short delay to show completion
|
| 59 |
+
if self.progress_bar is not None:
|
| 60 |
+
self.progress_bar.empty()
|
| 61 |
+
if self.status_text is not None:
|
| 62 |
+
self.status_text.empty()
|
| 63 |
+
|
| 64 |
+
def create_sidebar_options():
|
| 65 |
+
"""Create and return sidebar options"""
|
| 66 |
+
with st.sidebar:
|
| 67 |
+
st.title("OCR Settings")
|
| 68 |
+
|
| 69 |
+
# Create a container for the sidebar options
|
| 70 |
+
with st.container():
|
| 71 |
+
# Model selection
|
| 72 |
+
st.subheader("Model Selection")
|
| 73 |
+
use_vision = st.toggle("Use Vision Model", value=True, help="Use vision model for better understanding of document structure")
|
| 74 |
+
|
| 75 |
+
# Performance mode
|
| 76 |
+
perf_mode = st.radio("Performance Mode", PERFORMANCE_MODES,
|
| 77 |
+
horizontal=True,
|
| 78 |
+
help="Quality: Best results but slower. Speed: Faster but may be less accurate.")
|
| 79 |
+
|
| 80 |
+
# Document type selection
|
| 81 |
+
st.subheader("Document Type")
|
| 82 |
+
doc_type = st.selectbox("Document Type", DOCUMENT_TYPES,
|
| 83 |
+
help="Select the type of document you're processing for better results")
|
| 84 |
+
|
| 85 |
+
# Document layout
|
| 86 |
+
doc_layout = st.selectbox("Document Layout", DOCUMENT_LAYOUTS,
|
| 87 |
+
help="Select the layout of your document")
|
| 88 |
+
|
| 89 |
+
# Custom prompt
|
| 90 |
+
custom_prompt = ""
|
| 91 |
+
if doc_type != DOCUMENT_TYPES[0]: # Not auto-detect
|
| 92 |
+
# Get the template for the selected document type
|
| 93 |
+
prompt_template = CUSTOM_PROMPT_TEMPLATES.get(doc_type, "")
|
| 94 |
+
|
| 95 |
+
# Add layout information if not standard
|
| 96 |
+
if doc_layout != DOCUMENT_LAYOUTS[0]: # Not standard layout
|
| 97 |
+
layout_addition = LAYOUT_PROMPT_ADDITIONS.get(doc_layout, "")
|
| 98 |
+
if layout_addition:
|
| 99 |
+
prompt_template += " " + layout_addition
|
| 100 |
+
|
| 101 |
+
# Set the custom prompt
|
| 102 |
+
custom_prompt = prompt_template
|
| 103 |
+
|
| 104 |
+
# Allow user to edit the prompt
|
| 105 |
+
st.markdown("**Custom Processing Instructions**")
|
| 106 |
+
custom_prompt = st.text_area("", value=custom_prompt,
|
| 107 |
+
help="Customize the instructions for processing this document",
|
| 108 |
+
height=100)
|
| 109 |
+
|
| 110 |
+
# Image preprocessing options
|
| 111 |
+
st.subheader("Image Preprocessing")
|
| 112 |
+
|
| 113 |
+
# Document type for preprocessing
|
| 114 |
+
preprocessing_doc_type = st.radio("Document Type",
|
| 115 |
+
PREPROCESSING_DOC_TYPES,
|
| 116 |
+
horizontal=True,
|
| 117 |
+
help="Select the type of document for preprocessing")
|
| 118 |
+
|
| 119 |
+
# Grayscale conversion
|
| 120 |
+
grayscale = st.checkbox("Convert to Grayscale",
|
| 121 |
+
value=False,
|
| 122 |
+
help="Convert color images to grayscale for better OCR")
|
| 123 |
+
|
| 124 |
+
# Denoise
|
| 125 |
+
denoise = st.checkbox("Denoise Image",
|
| 126 |
+
value=False,
|
| 127 |
+
help="Remove noise from the image")
|
| 128 |
+
|
| 129 |
+
# Contrast adjustment
|
| 130 |
+
contrast = st.slider("Contrast Adjustment",
|
| 131 |
+
min_value=-50,
|
| 132 |
+
max_value=50,
|
| 133 |
+
value=0,
|
| 134 |
+
step=10,
|
| 135 |
+
help="Adjust image contrast")
|
| 136 |
+
|
| 137 |
+
# Rotation
|
| 138 |
+
rotation = st.slider("Rotation",
|
| 139 |
+
min_value=-45,
|
| 140 |
+
max_value=45,
|
| 141 |
+
value=0,
|
| 142 |
+
step=5,
|
| 143 |
+
help="Rotate image if needed")
|
| 144 |
+
|
| 145 |
+
# Create preprocessing options dictionary
|
| 146 |
+
preprocessing_options = {
|
| 147 |
+
"document_type": preprocessing_doc_type,
|
| 148 |
+
"grayscale": grayscale,
|
| 149 |
+
"denoise": denoise,
|
| 150 |
+
"contrast": contrast,
|
| 151 |
+
"rotation": rotation
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
# PDF-specific options
|
| 155 |
+
st.subheader("PDF Options")
|
| 156 |
+
pdf_dpi = st.slider("PDF Resolution (DPI)",
|
| 157 |
+
min_value=MIN_PDF_DPI,
|
| 158 |
+
max_value=MAX_PDF_DPI,
|
| 159 |
+
value=DEFAULT_PDF_DPI,
|
| 160 |
+
step=25,
|
| 161 |
+
help="Higher DPI gives better quality but slower processing")
|
| 162 |
+
|
| 163 |
+
max_pages = st.number_input("Maximum Pages to Process",
|
| 164 |
+
min_value=1,
|
| 165 |
+
max_value=20,
|
| 166 |
+
value=DEFAULT_MAX_PAGES,
|
| 167 |
+
help="Limit the number of pages to process (for multi-page PDFs)")
|
| 168 |
+
|
| 169 |
+
pdf_rotation = st.radio("PDF Rotation", ROTATION_OPTIONS,
|
| 170 |
+
horizontal=True,
|
| 171 |
+
format_func=lambda x: f"{x}°",
|
| 172 |
+
help="Rotate PDF pages if needed")
|
| 173 |
+
|
| 174 |
+
# Create options dictionary
|
| 175 |
+
options = {
|
| 176 |
+
"use_vision": use_vision,
|
| 177 |
+
"perf_mode": perf_mode,
|
| 178 |
+
"pdf_dpi": pdf_dpi,
|
| 179 |
+
"max_pages": max_pages,
|
| 180 |
+
"pdf_rotation": pdf_rotation,
|
| 181 |
+
"custom_prompt": custom_prompt,
|
| 182 |
+
"preprocessing_options": preprocessing_options
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
return options
|
| 186 |
+
|
| 187 |
+
def create_file_uploader():
|
| 188 |
+
"""Create and return a file uploader"""
|
| 189 |
+
# Add app description
|
| 190 |
+
favicon_path = os.path.join(os.path.dirname(__file__), "static/favicon.png")
|
| 191 |
+
favicon_base64 = get_base64_from_image(favicon_path)
|
| 192 |
+
st.markdown(f'<div style="display: flex; align-items: center; gap: 10px;"><img src="data:image/png;base64,{favicon_base64}" width="36" height="36" alt="Scroll Icon"/> <div><h1 style="margin: 0; padding: 20px 0 0 0;">Historical Document OCR</h1></div></div>', unsafe_allow_html=True)
|
| 193 |
+
st.subheader("Made possible by Mistral AI")
|
| 194 |
+
|
| 195 |
+
# Add project framing
|
| 196 |
+
st.markdown("""
|
| 197 |
+
This tool is designed to assist scholars in historical research by extracting text from challenging documents.
|
| 198 |
+
While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
|
| 199 |
+
historical documents, particularly:
|
| 200 |
+
|
| 201 |
+
- **Historical newspapers** with complex layouts and aged text
|
| 202 |
+
- **Handwritten documents** from various time periods
|
| 203 |
+
- **Photos of archival materials** that may be difficult to read
|
| 204 |
+
|
| 205 |
+
Upload a document to get started, or explore the example documents.
|
| 206 |
+
""")
|
| 207 |
+
|
| 208 |
+
# Create file uploader
|
| 209 |
+
uploaded_file = st.file_uploader(
|
| 210 |
+
"Upload a document",
|
| 211 |
+
type=["pdf", "png", "jpg", "jpeg"],
|
| 212 |
+
help="Upload a PDF or image file for OCR processing"
|
| 213 |
+
)
|
| 214 |
+
return uploaded_file
|
| 215 |
+
|
| 216 |
+
def display_results(result, container, custom_prompt=""):
|
| 217 |
+
"""Display OCR results in the provided container"""
|
| 218 |
+
with container:
|
| 219 |
+
# Display document metadata
|
| 220 |
+
st.subheader("Document Metadata")
|
| 221 |
+
|
| 222 |
+
# Create columns for metadata
|
| 223 |
+
meta_col1, meta_col2 = st.columns(2)
|
| 224 |
+
|
| 225 |
+
with meta_col1:
|
| 226 |
+
# Display document type and languages
|
| 227 |
+
if 'detected_document_type' in result:
|
| 228 |
+
st.write(f"**Document Type:** {result['detected_document_type']}")
|
| 229 |
+
|
| 230 |
+
if 'languages' in result:
|
| 231 |
+
languages = [lang for lang in result['languages'] if lang is not None]
|
| 232 |
+
if languages:
|
| 233 |
+
st.write(f"**Languages:** {', '.join(languages)}")
|
| 234 |
+
|
| 235 |
+
with meta_col2:
|
| 236 |
+
# Display processing time
|
| 237 |
+
if 'processing_time' in result:
|
| 238 |
+
st.write(f"**Processing Time:** {result['processing_time']:.1f}s")
|
| 239 |
+
|
| 240 |
+
# Display page information for PDFs
|
| 241 |
+
if 'limited_pages' in result:
|
| 242 |
+
st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
|
| 243 |
+
|
| 244 |
+
# Display subject tags if available
|
| 245 |
+
if 'topics' in result and result['topics']:
|
| 246 |
+
st.write("**Subject Tags:**")
|
| 247 |
+
# Create a container with flex display for the tags
|
| 248 |
+
st.markdown('<div style="display: flex; flex-wrap: wrap; gap: 5px; margin-top: 5px;">', unsafe_allow_html=True)
|
| 249 |
+
|
| 250 |
+
# Generate a badge for each tag
|
| 251 |
+
for topic in result['topics']:
|
| 252 |
+
# Create colored badge based on tag category
|
| 253 |
+
badge_color = "#546e7a" # Default color
|
| 254 |
+
|
| 255 |
+
# Assign colors by category
|
| 256 |
+
if any(term in topic.lower() for term in ["century", "pre-", "era", "historical"]):
|
| 257 |
+
badge_color = "#1565c0" # Blue for time periods
|
| 258 |
+
elif any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
|
| 259 |
+
badge_color = "#00695c" # Teal for languages
|
| 260 |
+
elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
|
| 261 |
+
badge_color = "#6a1b9a" # Purple for document types
|
| 262 |
+
elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
|
| 263 |
+
badge_color = "#2e7d32" # Green for subject domains
|
| 264 |
+
elif any(term in topic.lower() for term in ["preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated"]):
|
| 265 |
+
badge_color = "#e65100" # Orange for preprocessing-related tags
|
| 266 |
+
|
| 267 |
+
st.markdown(
|
| 268 |
+
f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
|
| 269 |
+
f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">{topic}</span>',
|
| 270 |
+
unsafe_allow_html=True
|
| 271 |
+
)
|
| 272 |
+
|
| 273 |
+
# Close the container
|
| 274 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 275 |
+
|
| 276 |
+
# Display OCR content
|
| 277 |
+
st.subheader("OCR Content")
|
| 278 |
+
|
| 279 |
+
# Check if we have OCR content
|
| 280 |
+
if 'ocr_contents' in result:
|
| 281 |
+
# Create tabs for different views
|
| 282 |
+
has_images = result.get('has_images', False)
|
| 283 |
+
if has_images:
|
| 284 |
+
content_tab1, content_tab2, content_tab3 = st.tabs(["Structured View", "Raw Text", "With Images"])
|
| 285 |
+
else:
|
| 286 |
+
content_tab1, content_tab2 = st.tabs(["Structured View", "Raw Text"])
|
| 287 |
+
|
| 288 |
+
with content_tab1:
|
| 289 |
+
# Display structured content
|
| 290 |
+
if isinstance(result['ocr_contents'], dict):
|
| 291 |
+
for section, content in result['ocr_contents'].items():
|
| 292 |
+
if content and section not in ['error', 'raw_text', 'partial_text']: # Skip error and raw text sections
|
| 293 |
+
st.markdown(f"#### {section.replace('_', ' ').title()}")
|
| 294 |
+
|
| 295 |
+
if isinstance(content, str):
|
| 296 |
+
st.write(content)
|
| 297 |
+
elif isinstance(content, list):
|
| 298 |
+
for item in content:
|
| 299 |
+
if isinstance(item, str):
|
| 300 |
+
st.write(f"- {item}")
|
| 301 |
+
else:
|
| 302 |
+
st.write(f"- {str(item)}")
|
| 303 |
+
elif isinstance(content, dict):
|
| 304 |
+
for k, v in content.items():
|
| 305 |
+
st.write(f"**{k}:** {v}")
|
| 306 |
+
|
| 307 |
+
with content_tab2:
|
| 308 |
+
# Display raw text with editing capability
|
| 309 |
+
raw_text = ""
|
| 310 |
+
if 'raw_text' in result['ocr_contents']:
|
| 311 |
+
raw_text = result['ocr_contents']['raw_text']
|
| 312 |
+
elif 'content' in result['ocr_contents']:
|
| 313 |
+
raw_text = result['ocr_contents']['content']
|
| 314 |
+
|
| 315 |
+
# Allow editing of the raw text
|
| 316 |
+
edited_text = st.text_area("Edit Raw Text", raw_text, height=400)
|
| 317 |
+
|
| 318 |
+
# Add a button to copy the edited text to clipboard
|
| 319 |
+
if st.button("Copy to Clipboard"):
|
| 320 |
+
st.success("Text copied to clipboard! (You can paste it elsewhere)")
|
| 321 |
+
# Note: The actual clipboard functionality is handled by the browser
|
| 322 |
+
|
| 323 |
+
# Add a download button for the edited text
|
| 324 |
+
st.download_button(
|
| 325 |
+
label="Download Edited Text",
|
| 326 |
+
data=edited_text,
|
| 327 |
+
file_name=f"{result.get('file_name', 'document').split('.')[0]}_edited.txt",
|
| 328 |
+
mime="text/plain"
|
| 329 |
+
)
|
| 330 |
+
|
| 331 |
+
if has_images and 'pages_data' in result:
|
| 332 |
+
with content_tab3:
|
| 333 |
+
# Use the display_document_with_images function
|
| 334 |
+
display_document_with_images(result)
|
| 335 |
+
|
| 336 |
+
# Display custom prompt if provided
|
| 337 |
+
if custom_prompt:
|
| 338 |
+
with st.expander("Custom Processing Instructions"):
|
| 339 |
+
st.write(custom_prompt)
|
| 340 |
+
|
| 341 |
+
# Add download buttons
|
| 342 |
+
st.subheader("Download Results")
|
| 343 |
+
|
| 344 |
+
# Create columns for download buttons
|
| 345 |
+
download_col1, download_col2 = st.columns(2)
|
| 346 |
+
|
| 347 |
+
with download_col1:
|
| 348 |
+
# JSON download
|
| 349 |
+
try:
|
| 350 |
+
json_str = json.dumps(result, indent=2)
|
| 351 |
+
st.download_button(
|
| 352 |
+
label="Download JSON",
|
| 353 |
+
data=json_str,
|
| 354 |
+
file_name=f"{result.get('file_name', 'document').split('.')[0]}_ocr.json",
|
| 355 |
+
mime="application/json"
|
| 356 |
+
)
|
| 357 |
+
except Exception as e:
|
| 358 |
+
st.error(f"Error creating JSON download: {str(e)}")
|
| 359 |
+
|
| 360 |
+
with download_col2:
|
| 361 |
+
# Text download
|
| 362 |
+
try:
|
| 363 |
+
if 'ocr_contents' in result:
|
| 364 |
+
if 'raw_text' in result['ocr_contents']:
|
| 365 |
+
text_content = result['ocr_contents']['raw_text']
|
| 366 |
+
elif 'content' in result['ocr_contents']:
|
| 367 |
+
text_content = result['ocr_contents']['content']
|
| 368 |
+
else:
|
| 369 |
+
text_content = str(result['ocr_contents'])
|
| 370 |
+
else:
|
| 371 |
+
text_content = "No text content available."
|
| 372 |
+
|
| 373 |
+
st.download_button(
|
| 374 |
+
label="Download Text",
|
| 375 |
+
data=text_content,
|
| 376 |
+
file_name=f"{result.get('file_name', 'document').split('.')[0]}_ocr.txt",
|
| 377 |
+
mime="text/plain"
|
| 378 |
+
)
|
| 379 |
+
except Exception as e:
|
| 380 |
+
st.error(f"Error creating text download: {str(e)}")
|
| 381 |
+
|
| 382 |
+
def display_document_with_images(result):
|
| 383 |
+
"""Display document with images"""
|
| 384 |
+
if 'pages_data' not in result:
|
| 385 |
+
st.info("No image data available.")
|
| 386 |
+
return
|
| 387 |
+
|
| 388 |
+
# Display each page
|
| 389 |
+
for i, page_data in enumerate(result['pages_data']):
|
| 390 |
+
st.markdown(f"### Page {i+1}")
|
| 391 |
+
|
| 392 |
+
# Create columns for image and text
|
| 393 |
+
img_col, text_col = st.columns([1, 1])
|
| 394 |
+
|
| 395 |
+
with img_col:
|
| 396 |
+
# Display the image
|
| 397 |
+
if 'image_data' in page_data:
|
| 398 |
+
try:
|
| 399 |
+
# Convert base64 to image
|
| 400 |
+
image_data = base64.b64decode(page_data['image_data'])
|
| 401 |
+
st.image(io.BytesIO(image_data), use_column_width=True)
|
| 402 |
+
except Exception as e:
|
| 403 |
+
st.error(f"Error displaying image: {str(e)}")
|
| 404 |
+
else:
|
| 405 |
+
st.info("No image available for this page.")
|
| 406 |
+
|
| 407 |
+
with text_col:
|
| 408 |
+
# Display the text with editing capability
|
| 409 |
+
if 'text' in page_data:
|
| 410 |
+
edited_text = st.text_area(f"Page {i+1} Text", page_data['text'], height=300, key=f"page_text_{i}")
|
| 411 |
+
|
| 412 |
+
# Add a button to copy the edited text to clipboard
|
| 413 |
+
if st.button(f"Copy Page {i+1} Text", key=f"copy_btn_{i}"):
|
| 414 |
+
st.success(f"Page {i+1} text copied to clipboard!")
|
| 415 |
+
else:
|
| 416 |
+
st.info("No text available for this page.")
|
| 417 |
+
|
| 418 |
+
def display_previous_results():
|
| 419 |
+
"""Display previous results tab content"""
|
| 420 |
+
st.markdown('<h2>Previous Results</h2>', unsafe_allow_html=True)
|
| 421 |
+
|
| 422 |
+
# Load custom CSS for Previous Results tab
|
| 423 |
+
try:
|
| 424 |
+
from ui.layout import load_css
|
| 425 |
+
load_css()
|
| 426 |
+
except ImportError:
|
| 427 |
+
# If ui.layout module is not available, use a simplified version
|
| 428 |
+
st.markdown("""
|
| 429 |
+
<style>
|
| 430 |
+
.previous-results-container {
|
| 431 |
+
margin-top: 20px;
|
| 432 |
+
}
|
| 433 |
+
.result-card {
|
| 434 |
+
background-color: #f8f9fa;
|
| 435 |
+
border-radius: 8px;
|
| 436 |
+
padding: 15px;
|
| 437 |
+
margin-bottom: 15px;
|
| 438 |
+
border: 1px solid #e0e0e0;
|
| 439 |
+
}
|
| 440 |
+
.result-header {
|
| 441 |
+
display: flex;
|
| 442 |
+
justify-content: space-between;
|
| 443 |
+
margin-bottom: 10px;
|
| 444 |
+
}
|
| 445 |
+
.result-filename {
|
| 446 |
+
font-weight: bold;
|
| 447 |
+
font-size: 16px;
|
| 448 |
+
}
|
| 449 |
+
.result-date {
|
| 450 |
+
color: #666;
|
| 451 |
+
font-size: 14px;
|
| 452 |
+
}
|
| 453 |
+
.result-metadata {
|
| 454 |
+
margin-top: 10px;
|
| 455 |
+
font-size: 14px;
|
| 456 |
+
}
|
| 457 |
+
.result-tag {
|
| 458 |
+
margin-bottom: 5px;
|
| 459 |
+
color: #555;
|
| 460 |
+
}
|
| 461 |
+
.result-action-button {
|
| 462 |
+
margin-top: 10px;
|
| 463 |
+
text-align: right;
|
| 464 |
+
}
|
| 465 |
+
.selected-result-container {
|
| 466 |
+
margin-top: 30px;
|
| 467 |
+
padding: 20px;
|
| 468 |
+
background-color: #f0f2f6;
|
| 469 |
+
border-radius: 8px;
|
| 470 |
+
}
|
| 471 |
+
.selected-result-title {
|
| 472 |
+
font-size: 18px;
|
| 473 |
+
font-weight: bold;
|
| 474 |
+
}
|
| 475 |
+
</style>
|
| 476 |
+
""", unsafe_allow_html=True)
|
| 477 |
+
|
| 478 |
+
# Display previous results if available
|
| 479 |
+
if not st.session_state.previous_results:
|
| 480 |
+
st.markdown("""
|
| 481 |
+
<div class="previous-results-container" style="text-align: center; padding: 40px 20px; background-color: #f0f2f6; border-radius: 8px;">
|
| 482 |
+
<div style="font-size: 48px; margin-bottom: 20px;">📄</div>
|
| 483 |
+
<h3 style="margin-bottom: 10px; font-weight: 600;">No Previous Results</h3>
|
| 484 |
+
<p style="font-size: 16px;">Process a document to see your results history saved here.</p>
|
| 485 |
+
</div>
|
| 486 |
+
""", unsafe_allow_html=True)
|
| 487 |
+
else:
|
| 488 |
+
# Create a container for the results list
|
| 489 |
+
st.markdown('<div class="previous-results-container">', unsafe_allow_html=True)
|
| 490 |
+
st.markdown(f'<h3>{len(st.session_state.previous_results)} Previous Results</h3>', unsafe_allow_html=True)
|
| 491 |
+
|
| 492 |
+
# Create two columns for filters and download buttons
|
| 493 |
+
filter_col, download_col = st.columns([2, 1])
|
| 494 |
+
|
| 495 |
+
with filter_col:
|
| 496 |
+
# Add filter options
|
| 497 |
+
filter_options = ["All Types"]
|
| 498 |
+
if any(result.get("file_name", "").lower().endswith(".pdf") for result in st.session_state.previous_results):
|
| 499 |
+
filter_options.append("PDF Documents")
|
| 500 |
+
if any(result.get("file_name", "").lower().endswith((".jpg", ".jpeg", ".png")) for result in st.session_state.previous_results):
|
| 501 |
+
filter_options.append("Images")
|
| 502 |
+
|
| 503 |
+
selected_filter = st.selectbox("Filter by Type:", filter_options)
|
| 504 |
+
|
| 505 |
+
with download_col:
|
| 506 |
+
# Add download all button for results
|
| 507 |
+
if len(st.session_state.previous_results) > 0:
|
| 508 |
+
try:
|
| 509 |
+
# Create buffer in memory instead of file on disk
|
| 510 |
+
import io
|
| 511 |
+
from ocr_utils import create_results_zip_in_memory
|
| 512 |
+
|
| 513 |
+
# Get zip data directly in memory
|
| 514 |
+
zip_data = create_results_zip_in_memory(st.session_state.previous_results)
|
| 515 |
+
|
| 516 |
+
# Create more informative ZIP filename with timestamp
|
| 517 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 518 |
+
|
| 519 |
+
# Count document types for a more descriptive filename
|
| 520 |
+
pdf_count = sum(1 for r in st.session_state.previous_results if r.get('file_name', '').lower().endswith('.pdf'))
|
| 521 |
+
img_count = sum(1 for r in st.session_state.previous_results if r.get('file_name', '').lower().endswith(('.jpg', '.jpeg', '.png')))
|
| 522 |
+
|
| 523 |
+
# Create more descriptive filename
|
| 524 |
+
if pdf_count > 0 and img_count > 0:
|
| 525 |
+
zip_filename = f"historical_ocr_mixed_{pdf_count}pdf_{img_count}img_{timestamp}.zip"
|
| 526 |
+
elif pdf_count > 0:
|
| 527 |
+
zip_filename = f"historical_ocr_pdf_documents_{pdf_count}_{timestamp}.zip"
|
| 528 |
+
elif img_count > 0:
|
| 529 |
+
zip_filename = f"historical_ocr_images_{img_count}_{timestamp}.zip"
|
| 530 |
+
else:
|
| 531 |
+
zip_filename = f"historical_ocr_results_{timestamp}.zip"
|
| 532 |
+
|
| 533 |
+
st.download_button(
|
| 534 |
+
label="Download All Results",
|
| 535 |
+
data=zip_data,
|
| 536 |
+
file_name=zip_filename,
|
| 537 |
+
mime="application/zip",
|
| 538 |
+
help="Download all previous results as a ZIP file containing HTML and JSON files"
|
| 539 |
+
)
|
| 540 |
+
except Exception as e:
|
| 541 |
+
st.error(f"Error creating download: {str(e)}")
|
| 542 |
+
st.info("Try with fewer results or individual downloads")
|
| 543 |
+
|
| 544 |
+
# Filter results based on selection
|
| 545 |
+
filtered_results = st.session_state.previous_results
|
| 546 |
+
if selected_filter == "PDF Documents":
|
| 547 |
+
filtered_results = [r for r in st.session_state.previous_results if r.get("file_name", "").lower().endswith(".pdf")]
|
| 548 |
+
elif selected_filter == "Images":
|
| 549 |
+
filtered_results = [r for r in st.session_state.previous_results if r.get("file_name", "").lower().endswith((".jpg", ".jpeg", ".png"))]
|
| 550 |
+
|
| 551 |
+
# Show a message if no results match the filter
|
| 552 |
+
if not filtered_results:
|
| 553 |
+
st.markdown("""
|
| 554 |
+
<div style="text-align: center; padding: 20px; background-color: #f9f9f9; border-radius: 5px; margin: 20px 0;">
|
| 555 |
+
<p>No results match the selected filter.</p>
|
| 556 |
+
</div>
|
| 557 |
+
""", unsafe_allow_html=True)
|
| 558 |
+
|
| 559 |
+
# Display each result as a card
|
| 560 |
+
for i, result in enumerate(filtered_results):
|
| 561 |
+
# Determine file type icon
|
| 562 |
+
file_name = result.get("file_name", f"Document {i+1}")
|
| 563 |
+
file_type_lower = file_name.lower()
|
| 564 |
+
|
| 565 |
+
if file_type_lower.endswith(".pdf"):
|
| 566 |
+
icon = "📄"
|
| 567 |
+
elif file_type_lower.endswith((".jpg", ".jpeg", ".png", ".gif")):
|
| 568 |
+
icon = "🖼️"
|
| 569 |
+
else:
|
| 570 |
+
icon = "📝"
|
| 571 |
+
|
| 572 |
+
# Create a card for each result
|
| 573 |
+
st.markdown(f"""
|
| 574 |
+
<div class="result-card">
|
| 575 |
+
<div class="result-header">
|
| 576 |
+
<div class="result-filename">{icon} {result.get('descriptive_file_name', file_name)}</div>
|
| 577 |
+
<div class="result-date">{result.get('timestamp', 'Unknown')}</div>
|
| 578 |
+
</div>
|
| 579 |
+
<div class="result-metadata">
|
| 580 |
+
<div class="result-tag">Languages: {', '.join(result.get('languages', ['Unknown']))}</div>
|
| 581 |
+
<div class="result-tag">Topics: {', '.join(result.get('topics', ['Unknown'])[:5])} {' + ' + str(len(result.get('topics', [])) - 5) + ' more' if len(result.get('topics', [])) > 5 else ''}</div>
|
| 582 |
+
</div>
|
| 583 |
+
""", unsafe_allow_html=True)
|
| 584 |
+
|
| 585 |
+
# Add view button inside the card with proper styling
|
| 586 |
+
st.markdown('<div class="result-action-button">', unsafe_allow_html=True)
|
| 587 |
+
if st.button(f"View Document", key=f"view_{i}"):
|
| 588 |
+
# Set the selected result in the session state
|
| 589 |
+
st.session_state.selected_previous_result = st.session_state.previous_results[i]
|
| 590 |
+
# Force a rerun to show the selected result
|
| 591 |
+
st.rerun()
|
| 592 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 593 |
+
|
| 594 |
+
# Close the result card
|
| 595 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 596 |
+
|
| 597 |
+
# Close the container
|
| 598 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 599 |
+
|
| 600 |
+
# Display the selected result if available
|
| 601 |
+
if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
|
| 602 |
+
selected_result = st.session_state.selected_previous_result
|
| 603 |
+
|
| 604 |
+
# Create a styled container for the selected result
|
| 605 |
+
st.markdown(f"""
|
| 606 |
+
<div class="selected-result-container">
|
| 607 |
+
<div class="result-header" style="margin-bottom: 20px;">
|
| 608 |
+
<div class="selected-result-title">Selected Document: {selected_result.get('file_name', 'Unknown')}</div>
|
| 609 |
+
<div class="result-date">{selected_result.get('timestamp', '')}</div>
|
| 610 |
+
</div>
|
| 611 |
+
""", unsafe_allow_html=True)
|
| 612 |
+
|
| 613 |
+
# Display metadata in a styled way
|
| 614 |
+
meta_col1, meta_col2 = st.columns(2)
|
| 615 |
+
|
| 616 |
+
with meta_col1:
|
| 617 |
+
# Display document metadata
|
| 618 |
+
if 'languages' in selected_result:
|
| 619 |
+
languages = [lang for lang in selected_result['languages'] if lang is not None]
|
| 620 |
+
if languages:
|
| 621 |
+
st.write(f"**Languages:** {', '.join(languages)}")
|
| 622 |
+
|
| 623 |
+
if 'topics' in selected_result and selected_result['topics']:
|
| 624 |
+
# Show topics in a more organized way with badges
|
| 625 |
+
st.markdown("**Subject Tags:**")
|
| 626 |
+
# Create a container with flex display for the tags
|
| 627 |
+
st.markdown('<div style="display: flex; flex-wrap: wrap; gap: 5px; margin-top: 5px;">', unsafe_allow_html=True)
|
| 628 |
+
|
| 629 |
+
# Generate a badge for each tag
|
| 630 |
+
for topic in selected_result['topics']:
|
| 631 |
+
# Create colored badge based on tag category
|
| 632 |
+
badge_color = "#546e7a" # Default color
|
| 633 |
+
|
| 634 |
+
# Assign colors by category
|
| 635 |
+
if any(term in topic.lower() for term in ["century", "pre-", "era", "historical"]):
|
| 636 |
+
badge_color = "#1565c0" # Blue for time periods
|
| 637 |
+
elif any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
|
| 638 |
+
badge_color = "#00695c" # Teal for languages
|
| 639 |
+
elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
|
| 640 |
+
badge_color = "#6a1b9a" # Purple for document types
|
| 641 |
+
elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
|
| 642 |
+
badge_color = "#2e7d32" # Green for subject domains
|
| 643 |
+
elif any(term in topic.lower() for term in ["preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated"]):
|
| 644 |
+
badge_color = "#e65100" # Orange for preprocessing-related tags
|
| 645 |
+
|
| 646 |
+
st.markdown(
|
| 647 |
+
f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
|
| 648 |
+
f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">{topic}</span>',
|
| 649 |
+
unsafe_allow_html=True
|
| 650 |
+
)
|
| 651 |
+
|
| 652 |
+
# Close the container
|
| 653 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 654 |
+
|
| 655 |
+
with meta_col2:
|
| 656 |
+
# Display processing metadata
|
| 657 |
+
if 'limited_pages' in selected_result:
|
| 658 |
+
st.info(f"Processed {selected_result['limited_pages']['processed']} of {selected_result['limited_pages']['total']} pages")
|
| 659 |
+
|
| 660 |
+
if 'processing_time' in selected_result:
|
| 661 |
+
proc_time = selected_result['processing_time']
|
| 662 |
+
st.write(f"**Processing Time:** {proc_time:.1f}s")
|
| 663 |
+
|
| 664 |
+
# Create tabs for content display
|
| 665 |
+
has_images = selected_result.get('has_images', False)
|
| 666 |
+
if has_images:
|
| 667 |
+
view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw Text", "With Images"])
|
| 668 |
+
else:
|
| 669 |
+
view_tab1, view_tab2 = st.tabs(["Structured View", "Raw Text"])
|
| 670 |
+
|
| 671 |
+
with view_tab1:
|
| 672 |
+
# Display structured content
|
| 673 |
+
if 'ocr_contents' in selected_result and isinstance(selected_result['ocr_contents'], dict):
|
| 674 |
+
for section, content in selected_result['ocr_contents'].items():
|
| 675 |
+
if content and section not in ['error', 'raw_text', 'partial_text']: # Skip error and raw text sections
|
| 676 |
+
st.markdown(f"#### {section.replace('_', ' ').title()}")
|
| 677 |
+
|
| 678 |
+
if isinstance(content, str):
|
| 679 |
+
st.write(content)
|
| 680 |
+
elif isinstance(content, list):
|
| 681 |
+
for item in content:
|
| 682 |
+
if isinstance(item, str):
|
| 683 |
+
st.write(f"- {item}")
|
| 684 |
+
else:
|
| 685 |
+
st.write(f"- {str(item)}")
|
| 686 |
+
elif isinstance(content, dict):
|
| 687 |
+
for k, v in content.items():
|
| 688 |
+
st.write(f"**{k}:** {v}")
|
| 689 |
+
|
| 690 |
+
with view_tab2:
|
| 691 |
+
# Display raw text with editing capability
|
| 692 |
+
raw_text = ""
|
| 693 |
+
if 'ocr_contents' in selected_result:
|
| 694 |
+
if 'raw_text' in selected_result['ocr_contents']:
|
| 695 |
+
raw_text = selected_result['ocr_contents']['raw_text']
|
| 696 |
+
elif 'content' in selected_result['ocr_contents']:
|
| 697 |
+
raw_text = selected_result['ocr_contents']['content']
|
| 698 |
+
|
| 699 |
+
# Allow editing of the raw text
|
| 700 |
+
edited_text = st.text_area("Edit Raw Text", raw_text, height=400, key="selected_raw_text")
|
| 701 |
+
|
| 702 |
+
# Add a button to copy the edited text to clipboard
|
| 703 |
+
if st.button("Copy to Clipboard", key="selected_copy_btn"):
|
| 704 |
+
st.success("Text copied to clipboard! (You can paste it elsewhere)")
|
| 705 |
+
|
| 706 |
+
# Add a download button for the edited text
|
| 707 |
+
st.download_button(
|
| 708 |
+
label="Download Edited Text",
|
| 709 |
+
data=edited_text,
|
| 710 |
+
file_name=f"{selected_result.get('file_name', 'document').split('.')[0]}_edited.txt",
|
| 711 |
+
mime="text/plain",
|
| 712 |
+
key="selected_download_btn"
|
| 713 |
+
)
|
| 714 |
+
|
| 715 |
+
if has_images and 'pages_data' in selected_result:
|
| 716 |
+
with view_tab3:
|
| 717 |
+
# Use the display_document_with_images function
|
| 718 |
+
display_document_with_images(selected_result)
|
| 719 |
+
|
| 720 |
+
# Close the container
|
| 721 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 722 |
+
|
| 723 |
+
# Add a button to close the selected result
|
| 724 |
+
if st.button("Close Selected Document", key="close_selected"):
|
| 725 |
+
# Clear the selected result from session state
|
| 726 |
+
del st.session_state.selected_previous_result
|
| 727 |
+
# Force a rerun to update the view
|
| 728 |
+
st.rerun()
|
| 729 |
+
|
| 730 |
+
def display_about_tab():
|
| 731 |
+
"""Display about tab content"""
|
| 732 |
+
st.markdown('<h2>About Historical OCR</h2>', unsafe_allow_html=True)
|
| 733 |
+
|
| 734 |
+
# Add app description
|
| 735 |
+
st.markdown("""
|
| 736 |
+
**Historical OCR** is a specialized tool for extracting text from historical documents, manuscripts, and printed materials.
|
| 737 |
+
|
| 738 |
+
### Purpose
|
| 739 |
+
|
| 740 |
+
This tool is designed to assist scholars in historical research by extracting text from challenging documents.
|
| 741 |
+
While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
|
| 742 |
+
historical documents, particularly:
|
| 743 |
+
|
| 744 |
+
- **Historical newspapers** with complex layouts and aged text
|
| 745 |
+
- **Handwritten documents** from various time periods
|
| 746 |
+
- **Photos of archival materials** that may be difficult to read
|
| 747 |
+
|
| 748 |
+
### Features
|
| 749 |
+
|
| 750 |
+
- **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
|
| 751 |
+
- **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
|
| 752 |
+
- **Editable Results**: Review and edit extracted text directly in the interface
|
| 753 |
+
- **Structured Content Analysis**: Automatic organization of document content
|
| 754 |
+
- **Multi-language Support**: Process documents in various languages
|
| 755 |
+
- **PDF Processing**: Handle multi-page historical documents
|
| 756 |
+
|
| 757 |
+
### How to Use
|
| 758 |
+
|
| 759 |
+
1. Upload a document (PDF or image)
|
| 760 |
+
2. Select the document type and adjust preprocessing options if needed
|
| 761 |
+
3. Add custom processing instructions for specialized documents
|
| 762 |
+
4. Process the document
|
| 763 |
+
5. Review, edit, and download the results
|
| 764 |
+
|
| 765 |
+
### Technologies
|
| 766 |
+
|
| 767 |
+
- OCR processing using Mistral AI's advanced document understanding capabilities
|
| 768 |
+
- Image preprocessing with OpenCV
|
| 769 |
+
- PDF handling with pdf2image
|
| 770 |
+
- Web interface with Streamlit
|
| 771 |
+
""")
|
| 772 |
+
|
| 773 |
+
# Add version information
|
| 774 |
+
st.markdown("**Version:** 1.0.0")
|
utils.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import base64
|
| 3 |
+
import hashlib
|
| 4 |
+
import time
|
| 5 |
+
import logging
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from functools import wraps
|
| 9 |
+
from constants import CONTENT_THEMES, PERIOD_TAGS, DEFAULT_TAGS, GENERIC_TAGS
|
| 10 |
+
|
| 11 |
+
# Configure logging
|
| 12 |
+
logger = logging.getLogger("utils")
|
| 13 |
+
logger.setLevel(logging.INFO)
|
| 14 |
+
|
| 15 |
+
def get_base64_from_image(image_path):
|
| 16 |
+
"""Get base64 string from image file"""
|
| 17 |
+
try:
|
| 18 |
+
with open(image_path, "rb") as img_file:
|
| 19 |
+
return base64.b64encode(img_file.read()).decode('utf-8')
|
| 20 |
+
except Exception as e:
|
| 21 |
+
logger.error(f"Error encoding image to base64: {str(e)}")
|
| 22 |
+
return ""
|
| 23 |
+
|
| 24 |
+
def timing(description):
|
| 25 |
+
"""Context manager for timing code execution"""
|
| 26 |
+
class TimingContext:
|
| 27 |
+
def __init__(self, description):
|
| 28 |
+
self.description = description
|
| 29 |
+
|
| 30 |
+
def __enter__(self):
|
| 31 |
+
self.start_time = time.time()
|
| 32 |
+
return self
|
| 33 |
+
|
| 34 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
| 35 |
+
end_time = time.time()
|
| 36 |
+
execution_time = end_time - self.start_time
|
| 37 |
+
logger.info(f"{self.description} took {execution_time:.2f} seconds")
|
| 38 |
+
return False
|
| 39 |
+
|
| 40 |
+
return TimingContext(description)
|
| 41 |
+
|
| 42 |
+
def format_timestamp(timestamp=None):
|
| 43 |
+
"""Format timestamp for display"""
|
| 44 |
+
if timestamp is None:
|
| 45 |
+
timestamp = datetime.now()
|
| 46 |
+
elif isinstance(timestamp, str):
|
| 47 |
+
try:
|
| 48 |
+
timestamp = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
|
| 49 |
+
except ValueError:
|
| 50 |
+
timestamp = datetime.now()
|
| 51 |
+
|
| 52 |
+
return timestamp.strftime("%Y-%m-%d %H:%M")
|
| 53 |
+
|
| 54 |
+
def generate_cache_key(file_bytes, file_type, use_vision, preprocessing_options=None, pdf_rotation=0, custom_prompt=None):
|
| 55 |
+
"""
|
| 56 |
+
Generate a cache key for OCR processing
|
| 57 |
+
|
| 58 |
+
Args:
|
| 59 |
+
file_bytes: File content as bytes
|
| 60 |
+
file_type: Type of file (pdf or image)
|
| 61 |
+
use_vision: Whether to use vision model
|
| 62 |
+
preprocessing_options: Dictionary of preprocessing options
|
| 63 |
+
pdf_rotation: PDF rotation value
|
| 64 |
+
custom_prompt: Custom prompt for OCR
|
| 65 |
+
|
| 66 |
+
Returns:
|
| 67 |
+
str: Cache key
|
| 68 |
+
"""
|
| 69 |
+
# Generate file hash
|
| 70 |
+
file_hash = hashlib.md5(file_bytes).hexdigest()
|
| 71 |
+
|
| 72 |
+
# Include preprocessing options in cache key
|
| 73 |
+
preprocessing_options_hash = ""
|
| 74 |
+
if preprocessing_options:
|
| 75 |
+
# Add pdf_rotation to preprocessing options to ensure it's part of the cache key
|
| 76 |
+
if pdf_rotation != 0:
|
| 77 |
+
preprocessing_options_with_rotation = preprocessing_options.copy()
|
| 78 |
+
preprocessing_options_with_rotation['pdf_rotation'] = pdf_rotation
|
| 79 |
+
preprocessing_str = str(sorted(preprocessing_options_with_rotation.items()))
|
| 80 |
+
else:
|
| 81 |
+
preprocessing_str = str(sorted(preprocessing_options.items()))
|
| 82 |
+
preprocessing_options_hash = hashlib.md5(preprocessing_str.encode()).hexdigest()
|
| 83 |
+
elif pdf_rotation != 0:
|
| 84 |
+
# If no preprocessing options but we have rotation, include that in the hash
|
| 85 |
+
preprocessing_options_hash = hashlib.md5(f"pdf_rotation_{pdf_rotation}".encode()).hexdigest()
|
| 86 |
+
|
| 87 |
+
# Create base cache key
|
| 88 |
+
cache_key = f"{file_hash}_{file_type}_{use_vision}_{preprocessing_options_hash}"
|
| 89 |
+
|
| 90 |
+
# Include custom prompt in cache key if provided
|
| 91 |
+
if custom_prompt:
|
| 92 |
+
custom_prompt_hash = hashlib.md5(str(custom_prompt).encode()).hexdigest()
|
| 93 |
+
cache_key = f"{cache_key}_{custom_prompt_hash}"
|
| 94 |
+
|
| 95 |
+
return cache_key
|
| 96 |
+
|
| 97 |
+
def handle_temp_files(temp_file_paths):
|
| 98 |
+
"""
|
| 99 |
+
Clean up temporary files
|
| 100 |
+
|
| 101 |
+
Args:
|
| 102 |
+
temp_file_paths: List of temporary file paths to clean up
|
| 103 |
+
"""
|
| 104 |
+
for temp_path in temp_file_paths:
|
| 105 |
+
try:
|
| 106 |
+
if os.path.exists(temp_path):
|
| 107 |
+
os.unlink(temp_path)
|
| 108 |
+
logger.info(f"Removed temporary file: {temp_path}")
|
| 109 |
+
except Exception as e:
|
| 110 |
+
logger.warning(f"Failed to remove temporary file {temp_path}: {str(e)}")
|
| 111 |
+
|
| 112 |
+
def create_descriptive_filename(original_filename, result, file_ext, preprocessing_options=None):
|
| 113 |
+
"""
|
| 114 |
+
Create a descriptive filename for the result
|
| 115 |
+
|
| 116 |
+
Args:
|
| 117 |
+
original_filename: Original filename
|
| 118 |
+
result: OCR result dictionary
|
| 119 |
+
file_ext: File extension
|
| 120 |
+
preprocessing_options: Dictionary of preprocessing options
|
| 121 |
+
|
| 122 |
+
Returns:
|
| 123 |
+
str: Descriptive filename
|
| 124 |
+
"""
|
| 125 |
+
# Get base name without extension
|
| 126 |
+
original_name = Path(original_filename).stem
|
| 127 |
+
|
| 128 |
+
# Add document type to filename if detected
|
| 129 |
+
doc_type_tag = ""
|
| 130 |
+
if 'detected_document_type' in result:
|
| 131 |
+
doc_type = result['detected_document_type'].lower()
|
| 132 |
+
doc_type_tag = f"_{doc_type.replace(' ', '_')}"
|
| 133 |
+
elif 'topics' in result and result['topics']:
|
| 134 |
+
# Use first tag as document type if not explicitly detected
|
| 135 |
+
doc_type_tag = f"_{result['topics'][0].lower().replace(' ', '_')}"
|
| 136 |
+
|
| 137 |
+
# Add period tag for historical context if available
|
| 138 |
+
period_tag = ""
|
| 139 |
+
if 'topics' in result and result['topics']:
|
| 140 |
+
for tag in result['topics']:
|
| 141 |
+
if "century" in tag.lower() or "pre-" in tag.lower() or "era" in tag.lower():
|
| 142 |
+
period_tag = f"_{tag.lower().replace(' ', '_')}"
|
| 143 |
+
break
|
| 144 |
+
|
| 145 |
+
# Generate final descriptive filename
|
| 146 |
+
descriptive_name = f"{original_name}{doc_type_tag}{period_tag}{file_ext}"
|
| 147 |
+
return descriptive_name
|
| 148 |
+
|
| 149 |
+
def extract_subject_tags(result, raw_text, preprocessing_options=None):
|
| 150 |
+
"""
|
| 151 |
+
Extract subject tags from OCR result
|
| 152 |
+
|
| 153 |
+
Args:
|
| 154 |
+
result: OCR result dictionary
|
| 155 |
+
raw_text: Raw text from OCR
|
| 156 |
+
preprocessing_options: Dictionary of preprocessing options
|
| 157 |
+
|
| 158 |
+
Returns:
|
| 159 |
+
list: Subject tags
|
| 160 |
+
"""
|
| 161 |
+
subject_tags = []
|
| 162 |
+
|
| 163 |
+
try:
|
| 164 |
+
# Use existing topics as starting point if available
|
| 165 |
+
if 'topics' in result and result['topics']:
|
| 166 |
+
subject_tags = list(result['topics'])
|
| 167 |
+
|
| 168 |
+
# Add document type if detected
|
| 169 |
+
if 'detected_document_type' in result:
|
| 170 |
+
doc_type = result['detected_document_type'].capitalize()
|
| 171 |
+
if doc_type not in subject_tags:
|
| 172 |
+
subject_tags.append(doc_type)
|
| 173 |
+
|
| 174 |
+
# Analyze content for common themes based on keywords
|
| 175 |
+
if raw_text:
|
| 176 |
+
raw_text_lower = raw_text.lower()
|
| 177 |
+
for theme, keywords in CONTENT_THEMES.items():
|
| 178 |
+
if any(keyword in raw_text_lower for keyword in keywords):
|
| 179 |
+
if theme not in subject_tags:
|
| 180 |
+
subject_tags.append(theme)
|
| 181 |
+
|
| 182 |
+
# Add document period tag if date patterns are detected
|
| 183 |
+
if raw_text:
|
| 184 |
+
# Look for years in content
|
| 185 |
+
import re
|
| 186 |
+
year_matches = re.findall(r'\b1[0-9]{3}\b|\b20[0-1][0-9]\b', raw_text)
|
| 187 |
+
if year_matches:
|
| 188 |
+
# Convert to integers
|
| 189 |
+
years = [int(y) for y in year_matches]
|
| 190 |
+
# Get earliest year
|
| 191 |
+
earliest = min(years)
|
| 192 |
+
|
| 193 |
+
# Find the period tag for this year
|
| 194 |
+
for year_range, period_tag in PERIOD_TAGS.items():
|
| 195 |
+
if year_range[0] <= earliest <= year_range[1]:
|
| 196 |
+
if period_tag not in subject_tags:
|
| 197 |
+
subject_tags.append(period_tag)
|
| 198 |
+
break
|
| 199 |
+
|
| 200 |
+
# Add languages as topics if available
|
| 201 |
+
if 'languages' in result and result['languages']:
|
| 202 |
+
for lang in result['languages']:
|
| 203 |
+
if lang and lang not in subject_tags:
|
| 204 |
+
lang_tag = f"{lang} Language"
|
| 205 |
+
subject_tags.append(lang_tag)
|
| 206 |
+
|
| 207 |
+
# Add preprocessing information as tags if preprocessing was applied
|
| 208 |
+
if preprocessing_options:
|
| 209 |
+
preprocessing_methods = []
|
| 210 |
+
if preprocessing_options.get("document_type", "standard") != "standard":
|
| 211 |
+
doc_type = preprocessing_options["document_type"].capitalize()
|
| 212 |
+
preprocessing_tag = f"Enhanced ({doc_type})"
|
| 213 |
+
if preprocessing_tag not in subject_tags:
|
| 214 |
+
subject_tags.append(preprocessing_tag)
|
| 215 |
+
|
| 216 |
+
if preprocessing_options.get("grayscale", False):
|
| 217 |
+
preprocessing_methods.append("Grayscale")
|
| 218 |
+
if preprocessing_options.get("denoise", False):
|
| 219 |
+
preprocessing_methods.append("Denoised")
|
| 220 |
+
if preprocessing_options.get("contrast", 0) != 0:
|
| 221 |
+
contrast_val = preprocessing_options.get("contrast", 0)
|
| 222 |
+
if contrast_val > 0:
|
| 223 |
+
preprocessing_methods.append("Contrast Enhanced")
|
| 224 |
+
else:
|
| 225 |
+
preprocessing_methods.append("Contrast Reduced")
|
| 226 |
+
if preprocessing_options.get("rotation", 0) != 0:
|
| 227 |
+
preprocessing_methods.append("Rotated")
|
| 228 |
+
|
| 229 |
+
# Add a combined preprocessing tag if methods were applied
|
| 230 |
+
if preprocessing_methods:
|
| 231 |
+
prep_tag = "Preprocessed"
|
| 232 |
+
if prep_tag not in subject_tags:
|
| 233 |
+
subject_tags.append(prep_tag)
|
| 234 |
+
|
| 235 |
+
# Add the specific method as a tag if only one was used
|
| 236 |
+
if len(preprocessing_methods) == 1:
|
| 237 |
+
method_tag = preprocessing_methods[0]
|
| 238 |
+
if method_tag not in subject_tags:
|
| 239 |
+
subject_tags.append(method_tag)
|
| 240 |
+
|
| 241 |
+
except Exception as e:
|
| 242 |
+
logger.warning(f"Error generating subject tags: {str(e)}")
|
| 243 |
+
# Fallback tags if extraction fails
|
| 244 |
+
if not subject_tags:
|
| 245 |
+
subject_tags = DEFAULT_TAGS.copy()
|
| 246 |
+
|
| 247 |
+
# Ensure we have at least 3 tags
|
| 248 |
+
while len(subject_tags) < 3:
|
| 249 |
+
for tag in DEFAULT_TAGS:
|
| 250 |
+
if tag not in subject_tags:
|
| 251 |
+
subject_tags.append(tag)
|
| 252 |
+
break
|
| 253 |
+
else:
|
| 254 |
+
# If all default tags are already used, add generic ones
|
| 255 |
+
for tag in GENERIC_TAGS:
|
| 256 |
+
if tag not in subject_tags:
|
| 257 |
+
subject_tags.append(tag)
|
| 258 |
+
break
|
| 259 |
+
else:
|
| 260 |
+
# If we still can't add any more tags, break the loop
|
| 261 |
+
break
|
| 262 |
+
|
| 263 |
+
return subject_tags
|