Spaces:

milwright
/

historical-ocr

Running

App Files Files Community

milwright committed on Apr 23

Commit

7647e70

1 Parent(s): 8d3bfba

Reconcile main with preview-improvements branch, implementing modular structure, raw text editing, and enhanced framing

Browse files

Files changed (9) hide show

app.py +0 -0
constants.py +110 -0
error_handler.py +65 -0
ocr_processing.py +279 -0
preprocessing.py +180 -0
ui/custom.css +222 -335
ui/layout.py +210 -20
ui_components.py +774 -0
utils.py +263 -0

app.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

constants.py ADDED Viewed

	@@ -0,0 +1,110 @@

+"""
+Constants for the Historical OCR application.
+This module contains all the constants used throughout the application,
+making it easier to maintain and update values in one place.
+"""
+# API limits
+# Upload ceiling in megabytes; error_handler.check_file_size compares the
+# uploaded byte count (converted to MB) against this value.
+MAX_FILE_SIZE_MB = 50
+MAX_PAGES = 20
+# Caching
+# NOTE(review): the st.cache_data decorators in ocr_processing.py and
+# preprocessing.py spell out 24*3600 / 20 literally instead of importing
+# these two names - confirm they are meant to stay in sync.
+CACHE_TTL_SECONDS = 24 * 3600  # 24 hours
+MAX_CACHE_ENTRIES = 20
+# Image processing
+# NOTE(review): preprocessing.preprocess_image hard-codes the same 2500 and 92
+# values rather than importing these constants - confirm which is authoritative.
+MAX_IMAGE_DIMENSION = 2500
+IMAGE_QUALITY = 92
+# Document types
+# Every entry except "Auto-detect (standard processing)" has a matching key in
+# CUSTOM_PROMPT_TEMPLATES below.
+DOCUMENT_TYPES = [
+    "Auto-detect (standard processing)",
+    "Newspaper or Magazine",
+    "Letter or Correspondence",
+    "Book or Publication",
+    "Form or Legal Document",
+    "Recipe",
+    "Handwritten Document",
+    "Map or Illustration",
+    "Table or Spreadsheet",
+    "Other (specify in instructions)"
+]
+# Document layouts
+# Every entry except "Standard layout" has a matching key in
+# LAYOUT_PROMPT_ADDITIONS below.
+DOCUMENT_LAYOUTS = [
+    "Standard layout",
+    "Multiple columns",
+    "Table/grid format",
+    "Mixed layout with images"
+]
+# Preprocessing document types
+PREPROCESSING_DOC_TYPES = ["standard", "handwritten", "typed", "printed"]
+# Rotation options
+ROTATION_OPTIONS = [0, 90, 180, 270]
+# PDF settings
+# NOTE(review): ocr_processing.process_file and
+# preprocessing.convert_pdf_to_images both default their DPI argument to 150,
+# not DEFAULT_PDF_DPI - confirm which default is intended.
+DEFAULT_PDF_DPI = 100
+MIN_PDF_DPI = 72
+MAX_PDF_DPI = 300
+DEFAULT_MAX_PAGES = 3
+# Performance modes
+PERFORMANCE_MODES = ["Quality", "Speed"]
+# Custom prompt templates
+# Maps a DOCUMENT_TYPES label to the prompt text associated with that type.
+CUSTOM_PROMPT_TEMPLATES = {
+    "Newspaper or Magazine": "This is a newspaper/magazine. Process columns from top to bottom, capture headlines, bylines, article text and captions.",
+    "Letter or Correspondence": "This is a letter/correspondence. Capture letterhead, date, greeting, body, closing and signature. Note any handwritten annotations.",
+    "Book or Publication": "This is a book/publication. Extract titles, headers, footnotes, page numbers and body text. Preserve paragraph structure and any special formatting.",
+    "Form or Legal Document": "This is a form/legal document. Extract all field labels and values, preserving the structure. Pay special attention to signature lines, dates, and any official markings.",
+    "Recipe": "This is a recipe. Extract title, ingredients list with measurements, and preparation instructions. Maintain the distinction between ingredients and preparation steps.",
+    "Handwritten Document": "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations.",
+    "Map or Illustration": "This is a map or illustration. Transcribe all labels, legends, captions, and annotations. Note any scale indicators or directional markings.",
+    "Table or Spreadsheet": "This is a table/spreadsheet. Preserve row and column structure, maintaining alignment of data. Extract headers and all cell values.",
+    "Other (specify in instructions)": "Please describe the document type and any special processing requirements here."
+}
+# Layout prompt additions
+# Maps a DOCUMENT_LAYOUTS label to extra prompt text for that layout.
+LAYOUT_PROMPT_ADDITIONS = {
+    "Multiple columns": "Document has multiple columns. Read each column from top to bottom, then move to the next column.",
+    "Table/grid format": "Document contains table data. Preserve row and column structure during extraction.",
+    "Mixed layout with images": "Document has mixed text layout with images. Extract text in proper reading order."
+}
+# Content themes for subject tag extraction
+# Maps a theme tag to the lowercase keywords associated with it.
+CONTENT_THEMES = {
+    "Historical": ["century", "ancient", "historical", "history", "vintage", "archive", "heritage"],
+    "Travel": ["travel", "journey", "expedition", "exploration", "voyage", "map", "location"],
+    "Science": ["experiment", "research", "study", "analysis", "scientific", "laboratory"],
+    "Literature": ["book", "novel", "poetry", "author", "literary", "chapter", "story"],
+    "Art": ["painting", "illustration", "drawing", "artist", "exhibit", "gallery", "portrait"],
+    "Education": ["education", "school", "university", "college", "learning", "student", "teach"],
+    "Politics": ["government", "political", "policy", "administration", "election", "legislature"],
+    "Business": ["business", "company", "corporation", "market", "industry", "commercial", "trade"],
+    "Social": ["society", "community", "social", "culture", "tradition", "customs"],
+    "Technology": ["technology", "invention", "device", "mechanical", "machine", "technical"],
+    "Military": ["military", "army", "navy", "war", "battle", "soldier", "weapon"],
+    "Religion": ["religion", "church", "temple", "spiritual", "sacred", "ritual"],
+    "Medicine": ["medical", "medicine", "health", "hospital", "treatment", "disease", "doctor"],
+    "Legal": ["legal", "law", "court", "justice", "attorney", "judicial", "statute"],
+    "Correspondence": ["letter", "mail", "correspondence", "message", "communication"]
+}
+# Period tags based on year ranges
+# Keys are (first_year, last_year) tuples; the ranges are contiguous and do not
+# overlap, so both bounds are presumably inclusive - verify against the lookup code.
+PERIOD_TAGS = {
+    (0, 1799): "Pre-1800s",
+    (1800, 1849): "Early 19th Century",
+    (1850, 1899): "Late 19th Century",
+    (1900, 1949): "Early 20th Century",
+    (1950, 2099): "Modern Era"
+}
+# Default fallback tags
+DEFAULT_TAGS = ["Document", "Historical", "Text"]
+GENERIC_TAGS = ["Archive", "Content", "Record"]
+# UI constants
+PROGRESS_DELAY = 0.8  # Seconds to show completion message

error_handler.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import logging
+import streamlit as st
+import time
+from constants import MAX_FILE_SIZE_MB
+# Configure logging
+logger = logging.getLogger("error_handler")
+logger.setLevel(logging.INFO)
+def handle_ocr_error(exception, progress_reporter=None):
+    """
+    Handle OCR processing errors and provide user-friendly messages
+    Args:
+        exception: The exception that occurred
+        progress_reporter: ProgressReporter instance for UI updates
+    Returns:
+        str: User-friendly error message
+    """
+    error_message = str(exception)
+    # Complete progress reporting if provided
+    if progress_reporter:
+        progress_reporter.complete(success=False)
+    # Check for specific error types and provide helpful user-facing messages
+    if "rate limit" in error_message.lower() or "429" in error_message or "requests rate limit exceeded" in error_message.lower():
+        friendly_message = "The AI service is currently experiencing high demand. Please try again in a few minutes."
+        logger.error(f"Rate limit error: {error_message}")
+        return friendly_message
+    elif "quota" in error_message.lower() or "credit" in error_message.lower() or "subscription" in error_message.lower():
+        friendly_message = "The API usage quota has been reached. Please check your API key and subscription limits."
+        logger.error(f"API quota error: {error_message}")
+        return friendly_message
+    elif "timeout" in error_message.lower() or "timed out" in error_message.lower():
+        friendly_message = "The request timed out. This may be due to a large document or high server load. Please try again or use a smaller document."
+        logger.error(f"Timeout error: {error_message}")
+        return friendly_message
+    elif "file size" in error_message.lower() or "too large" in error_message.lower():
+        friendly_message = f"The file is too large. Maximum file size is {MAX_FILE_SIZE_MB}MB."
+        logger.error(f"File size error: {error_message}")
+        return friendly_message
+    else:
+        # Generic error message for other errors
+        logger.error(f"OCR processing error: {error_message}", exc_info=True)
+        return f"An error occurred during processing: {error_message}"
+def check_file_size(file_bytes):
+    """
+    Check if file size is within limits
+    Args:
+        file_bytes: File content as bytes
+    Returns:
+        tuple: (is_valid, file_size_mb, error_message)
+    """
+    file_size_mb = len(file_bytes) / (1024 * 1024)
+    if file_size_mb > MAX_FILE_SIZE_MB:
+        error_message = f"File size {file_size_mb:.2f} MB exceeds limit of {MAX_FILE_SIZE_MB} MB"
+        return False, file_size_mb, error_message
+    return True, file_size_mb, None

ocr_processing.py ADDED Viewed

	@@ -0,0 +1,279 @@

+import os
+import hashlib
+import tempfile
+import streamlit as st
+import logging
+import time
+from datetime import datetime
+from pathlib import Path
+from structured_ocr import StructuredOCR
+from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
+from preprocessing import apply_preprocessing_to_file
+from error_handler import handle_ocr_error, check_file_size
+# Configure logging
+logger = logging.getLogger("ocr_processing")
+logger.setLevel(logging.INFO)
+@st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
+def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None):
+    """
+    Cached version of OCR processing to reuse results
+    Args:
+        file_path: Path to the file to process
+        file_type: Type of file (pdf or image)
+        use_vision: Whether to use vision model
+        file_size_mb: File size in MB
+        cache_key: Cache key for the file
+        preprocessing_options_hash: Hash of preprocessing options
+    Returns:
+        dict: OCR result
+    """
+    # Initialize OCR processor
+    processor = StructuredOCR()
+    # Process the file
+    with timing(f"OCR processing of {file_type} file"):
+        result = processor.process_file(
+            file_path,
+            file_type=file_type,
+            use_vision=use_vision,
+            file_size_mb=file_size_mb
+        )
+    return result
+def process_file(uploaded_file, use_vision=True, preprocessing_options=None, progress_reporter=None,
+                 pdf_dpi=150, max_pages=3, pdf_rotation=0, custom_prompt=None, perf_mode="Quality"):
+    """
+    Process the uploaded file and return the OCR results
+    Args:
+        uploaded_file: The uploaded file to process
+        use_vision: Whether to use vision model
+        preprocessing_options: Dictionary of preprocessing options
+        progress_reporter: ProgressReporter instance for UI updates
+        pdf_dpi: DPI for PDF conversion
+        max_pages: Maximum number of pages to process
+        pdf_rotation: PDF rotation value
+        custom_prompt: Custom prompt for OCR
+        perf_mode: Performance mode (Quality or Speed)
+    Returns:
+        dict: OCR result
+    """
+    if preprocessing_options is None:
+        preprocessing_options = {}
+    # Create a container for progress indicators if not provided
+    if progress_reporter is None:
+        from ui_components import ProgressReporter
+        progress_reporter = ProgressReporter(st.empty()).setup()
+    # Initialize temporary file paths list
+    temp_file_paths = []
+    try:
+        # Check if file size exceeds maximum allowed size
+        is_valid, file_size_mb, error_message = check_file_size(uploaded_file.getvalue())
+        if not is_valid:
+            progress_reporter.complete(success=False)
+            st.error(error_message)
+            return {
+                "file_name": uploaded_file.name,
+                "topics": ["Document"],
+                "languages": ["English"],
+                "error": error_message,
+                "ocr_contents": {
+                    "error": error_message,
+                    "partial_text": "Document could not be processed due to size limitations."
+                }
+            }
+        # Update progress
+        progress_reporter.update(10, "Initializing OCR processor...")
+        # Determine file type from extension
+        file_ext = Path(uploaded_file.name).suffix.lower()
+        file_type = "pdf" if file_ext == ".pdf" else "image"
+        file_bytes = uploaded_file.getvalue()
+        # For PDFs, we need to handle differently
+        if file_type == "pdf":
+            progress_reporter.update(20, "Converting PDF to images...")
+            # Process PDF with direct handling
+            progress_reporter.update(30, "Processing PDF with OCR...")
+            # Create a temporary file for processing
+            temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=file_ext).name
+            with open(temp_path, 'wb') as f:
+                f.write(file_bytes)
+            temp_file_paths.append(temp_path)
+            # Generate cache key
+            cache_key = generate_cache_key(
+                file_bytes,
+                file_type,
+                use_vision,
+                preprocessing_options,
+                pdf_rotation,
+                custom_prompt
+            )
+            # Process with cached function if possible
+            try:
+                result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options))
+                progress_reporter.update(90, "Finalizing results...")
+            except Exception as e:
+                logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
+                progress_reporter.update(60, f"Processing error: {str(e)}. Retrying...")
+                # If caching fails, process directly
+                processor = StructuredOCR()
+                # Apply performance mode settings
+                if perf_mode == "Speed":
+                    # Override settings for faster processing
+                    if pdf_dpi > 100:
+                        pdf_dpi = 100  # Lower DPI for speed
+                # Process directly with optimized settings
+                result = processor.process_file(
+                    file_path=temp_path,
+                    file_type="pdf",
+                    use_vision=use_vision,
+                    custom_prompt=custom_prompt,
+                    file_size_mb=file_size_mb,
+                    pdf_rotation=pdf_rotation
+                )
+                progress_reporter.update(90, "Finalizing results...")
+        else:
+            # For image files
+            progress_reporter.update(20, "Preparing image for processing...")
+            # Apply preprocessing if needed
+            temp_path, preprocessing_applied = apply_preprocessing_to_file(
+                file_bytes,
+                file_ext,
+                preprocessing_options,
+                temp_file_paths
+            )
+            if preprocessing_applied:
+                progress_reporter.update(30, "Applied image preprocessing...")
+            # Generate cache key
+            cache_key = generate_cache_key(
+                open(temp_path, 'rb').read(),
+                file_type,
+                use_vision,
+                preprocessing_options,
+                0,  # No rotation for images (handled in preprocessing)
+                custom_prompt
+            )
+            # Process the file using cached function if possible
+            progress_reporter.update(50, "Processing document with OCR...")
+            try:
+                result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options))
+                progress_reporter.update(80, "Analyzing document structure...")
+                progress_reporter.update(90, "Finalizing results...")
+            except Exception as e:
+                logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
+                progress_reporter.update(60, f"Processing error: {str(e)}. Retrying...")
+                # If caching fails, process directly
+                processor = StructuredOCR()
+                # Apply performance mode settings
+                if perf_mode == "Speed":
+                    # Use simpler processing for speed
+                    pass  # Any speed optimizations would be handled by the StructuredOCR class
+                result = processor.process_file(
+                    file_path=temp_path,
+                    file_type=file_type,
+                    use_vision=use_vision,
+                    custom_prompt=custom_prompt,
+                    file_size_mb=file_size_mb
+                )
+                progress_reporter.update(90, "Finalizing results...")
+        # Add additional metadata to result
+        result = process_result(result, uploaded_file, preprocessing_options)
+        # Complete progress
+        progress_reporter.complete()
+        return result
+    except Exception as e:
+        # Handle errors
+        error_message = handle_ocr_error(e, progress_reporter)
+        # Return error result
+        return {
+            "file_name": uploaded_file.name,
+            "topics": ["Document"],
+            "languages": ["English"],
+            "error": error_message,
+            "ocr_contents": {
+                "error": f"Failed to process file: {error_message}",
+                "partial_text": "Document could not be processed due to an error."
+            }
+        }
+    finally:
+        # Clean up temporary files
+        for temp_path in temp_file_paths:
+            try:
+                if os.path.exists(temp_path):
+                    os.unlink(temp_path)
+                    logger.info(f"Removed temporary file: {temp_path}")
+            except Exception as e:
+                logger.warning(f"Failed to remove temporary file {temp_path}: {str(e)}")
+def process_result(result, uploaded_file, preprocessing_options=None):
+    """
+    Process OCR result to add metadata, tags, etc.
+    Args:
+        result: OCR result dictionary
+        uploaded_file: The uploaded file
+        preprocessing_options: Dictionary of preprocessing options
+    Returns:
+        dict: Processed OCR result
+    """
+    # Add timestamp
+    result['timestamp'] = format_timestamp()
+    # Add processing time if not already present
+    if 'processing_time' not in result:
+        result['processing_time'] = 0.0
+    # Generate descriptive filename
+    file_ext = Path(uploaded_file.name).suffix.lower()
+    result['descriptive_file_name'] = create_descriptive_filename(
+        uploaded_file.name,
+        result,
+        file_ext,
+        preprocessing_options
+    )
+    # Extract raw text from OCR contents
+    raw_text = ""
+    if 'ocr_contents' in result:
+        if 'raw_text' in result['ocr_contents']:
+            raw_text = result['ocr_contents']['raw_text']
+        elif 'content' in result['ocr_contents']:
+            raw_text = result['ocr_contents']['content']
+    # Extract subject tags if not already present or enhance existing ones
+    if 'topics' not in result or not result['topics']:
+        result['topics'] = extract_subject_tags(result, raw_text, preprocessing_options)
+    return result

preprocessing.py ADDED Viewed

	@@ -0,0 +1,180 @@

+import os
+import io
+import cv2
+import numpy as np
+import tempfile
+from PIL import Image, ImageEnhance, ImageFilter
+from pdf2image import convert_from_bytes
+import streamlit as st
+import logging
+# Configure logging
+logger = logging.getLogger("preprocessing")
+logger.setLevel(logging.INFO)
+@st.cache_data(ttl=24*3600, show_spinner=False)  # Cache for 24 hours
+def convert_pdf_to_images(pdf_bytes, dpi=150, rotation=0):
+    """Convert PDF bytes to a list of images with caching"""
+    try:
+        images = convert_from_bytes(pdf_bytes, dpi=dpi)
+        # Apply rotation if specified
+        if rotation != 0 and images:
+            rotated_images = []
+            for img in images:
+                rotated_img = img.rotate(rotation, expand=True, resample=Image.BICUBIC)
+                rotated_images.append(rotated_img)
+            return rotated_images
+        return images
+    except Exception as e:
+        st.error(f"Error converting PDF: {str(e)}")
+        logger.error(f"PDF conversion error: {str(e)}")
+        return []
+@st.cache_data(ttl=24*3600, show_spinner=False, hash_funcs={dict: lambda x: str(sorted(x.items()))})
+def preprocess_image(image_bytes, preprocessing_options):
+    """Preprocess image with selected options optimized for historical document OCR quality"""
+    # Setup basic console logging
+    logger = logging.getLogger("image_preprocessor")
+    logger.setLevel(logging.INFO)
+    # Log which preprocessing options are being applied
+    logger.info(f"Preprocessing image with options: {preprocessing_options}")
+    # Convert bytes to PIL Image
+    image = Image.open(io.BytesIO(image_bytes))
+    # Check for alpha channel (RGBA) and convert to RGB if needed
+    if image.mode == 'RGBA':
+        # Convert RGBA to RGB by compositing the image onto a white background
+        background = Image.new('RGB', image.size, (255, 255, 255))
+        background.paste(image, mask=image.split()[3])  # 3 is the alpha channel
+        image = background
+        logger.info("Converted RGBA image to RGB")
+    elif image.mode not in ('RGB', 'L'):
+        # Convert other modes to RGB as well
+        image = image.convert('RGB')
+        logger.info(f"Converted {image.mode} image to RGB")
+    # Apply rotation if specified
+    if preprocessing_options.get("rotation", 0) != 0:
+        rotation_degrees = preprocessing_options.get("rotation")
+        image = image.rotate(rotation_degrees, expand=True, resample=Image.BICUBIC)
+    # Resize large images while preserving details important for OCR
+    width, height = image.size
+    max_dimension = max(width, height)
+    # Less aggressive resizing to preserve document details
+    if max_dimension > 2500:
+        scale_factor = 2500 / max_dimension
+        new_width = int(width * scale_factor)
+        new_height = int(height * scale_factor)
+        # Use LANCZOS for better quality preservation
+        image = image.resize((new_width, new_height), Image.LANCZOS)
+    img_array = np.array(image)
+    # Apply preprocessing based on selected options with settings optimized for historical documents
+    document_type = preprocessing_options.get("document_type", "standard")
+    # Process grayscale option first as it's a common foundation
+    if preprocessing_options.get("grayscale", False):
+        if len(img_array.shape) == 3:  # Only convert if it's not already grayscale
+            if document_type == "handwritten":
+                # Enhanced grayscale processing for handwritten documents
+                img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+                # Apply adaptive histogram equalization to enhance handwriting
+                clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
+                img_array = clahe.apply(img_array)
+            else:
+                # Standard grayscale for printed documents
+                img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+            # Convert back to RGB for further processing
+            img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
+    if preprocessing_options.get("contrast", 0) != 0:
+        contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 100)
+        image = Image.fromarray(img_array)
+        enhancer = ImageEnhance.Contrast(image)
+        image = enhancer.enhance(contrast_factor)
+        img_array = np.array(image)
+    if preprocessing_options.get("denoise", False):
+        try:
+            # Apply appropriate denoising based on document type
+            if document_type == "handwritten":
+                # Very light denoising for handwritten documents to preserve pen strokes
+                if len(img_array.shape) == 3 and img_array.shape[2] == 3:  # Color image
+                    img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 3, 3, 5, 9)
+                else:  # Grayscale image
+                    img_array = cv2.fastNlMeansDenoising(img_array, None, 3, 7, 21)
+            else:
+                # Standard denoising for printed documents
+                if len(img_array.shape) == 3 and img_array.shape[2] == 3:  # Color image
+                    img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 5, 5, 7, 21)
+                else:  # Grayscale image
+                    img_array = cv2.fastNlMeansDenoising(img_array, None, 5, 7, 21)
+        except Exception as e:
+            logger.error(f"Denoising error: {str(e)}, falling back to standard processing")
+    # Convert back to PIL Image
+    processed_image = Image.fromarray(img_array)
+    # Higher quality for OCR processing
+    byte_io = io.BytesIO()
+    try:
+        # Make sure the image is in RGB mode before saving as JPEG
+        if processed_image.mode not in ('RGB', 'L'):
+            processed_image = processed_image.convert('RGB')
+        processed_image.save(byte_io, format='JPEG', quality=92, optimize=True)
+        byte_io.seek(0)
+        logger.info(f"Preprocessing complete. Original image mode: {image.mode}, processed mode: {processed_image.mode}")
+        logger.info(f"Original size: {len(image_bytes)/1024:.1f}KB, processed size: {len(byte_io.getvalue())/1024:.1f}KB")
+        return byte_io.getvalue()
+    except Exception as e:
+        logger.error(f"Error saving processed image: {str(e)}")
+        # Fallback to original image
+        logger.info("Using original image as fallback")
+        image_io = io.BytesIO()
+        image.save(image_io, format='JPEG', quality=92)
+        image_io.seek(0)
+        return image_io.getvalue()
+def create_temp_file(content, suffix, temp_file_paths):
+    """Create a temporary file and track it for cleanup"""
+    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+        tmp.write(content)
+        temp_path = tmp.name
+        # Track temporary file for cleanup
+        temp_file_paths.append(temp_path)
+        logger.info(f"Created temporary file: {temp_path}")
+        return temp_path
+def apply_preprocessing_to_file(file_bytes, file_ext, preprocessing_options, temp_file_paths):
+    """Apply preprocessing to file and return path to processed file"""
+    # Check if any preprocessing options with boolean values are True, or if any non-boolean values are non-default
+    has_preprocessing = (
+        preprocessing_options.get("grayscale", False) or
+        preprocessing_options.get("denoise", False) or
+        preprocessing_options.get("contrast", 0) != 0 or
+        preprocessing_options.get("rotation", 0) != 0 or
+        preprocessing_options.get("document_type", "standard") != "standard"
+    )
+    if has_preprocessing:
+        # Apply preprocessing
+        processed_bytes = preprocess_image(file_bytes, preprocessing_options)
+        # Save processed image to temp file
+        temp_path = create_temp_file(processed_bytes, file_ext, temp_file_paths)
+        return temp_path, True  # Return path and flag indicating preprocessing was applied
+    else:
+        # No preprocessing needed, just save the original file
+        temp_path = create_temp_file(file_bytes, file_ext, temp_file_paths)
+        return temp_path, False  # Return path and flag indicating no preprocessing was applied

ui/custom.css CHANGED Viewed

@@ -1,395 +1,282 @@
-/* Minimal essential styling */
-/* Processing status container */
-.processing-status-container {
-    margin: 10px 0;
-    padding: 8px 12px;
-    border-left: 3px solid #5c6bc0;
-    font-size: 0.9rem;
 }
-/* Result card styling */
-.previous-results-container {
-    margin-bottom: 20px;
-    color: #000000 !important;
-    background-color: #ffffff !important;
 }
-/* Let Streamlit handle text colors based on background */
-/* Reset forced white text to use Streamlit defaults */
-.result-card {
     border: 1px solid #e0e0e0;
     border-radius: 4px;
     padding: 15px;
     margin-bottom: 15px;
 }
 .result-header {
     display: flex;
     justify-content: space-between;
     margin-bottom: 10px;
-    padding-bottom: 5px;
-    border-bottom: 1px solid #e0e0e0;
 }
 .result-filename {
     font-weight: bold;
-    font-size: 1.1rem;
 }
 .result-date {
-    font-size: 0.9rem;
     color: #666;
 }
 .result-metadata {
-    display: flex;
-    flex-wrap: wrap;
-    gap: 8px;
-    margin-bottom: 10px;
 }
 .result-tag {
-    background-color: #e3f2fd;
-    border-radius: 16px;
-    padding: 3px 10px;
-    font-size: 0.85rem;
-    color: #1565c0;
 }
 .selected-result-container {
-    border: 1px solid #e0e0e0;
-    border-radius: 4px;
     padding: 20px;
-    margin: 15px 0;
 }
 .selected-result-title {
-    font-size: 1.3rem;
     font-weight: bold;
-    margin-bottom: 15px;
 }
-/* Fix for image preprocessing preview */
-.stExpander {
-    overflow: hidden !important;
-    margin-bottom: 10px !important;
 }
-.stExpander img {
-    max-width: 100% !important;
-    height: auto !important;
-    object-fit: contain !important;
 }
-/* Additional fixes for image preprocessing preview in expanders */
-.streamlit-expanderContent {
-    overflow: hidden !important;
-    padding-top: 5px !important;
-    padding-bottom: 5px !important;
 }
-.streamlit-expanderContent img {
-    max-width: 95% !important;
-    height: auto !important;
-    object-fit: contain !important;
 }
-/* Compact sidebar expanders */
-.stSidebar .stExpander {
-    margin-top: 0 !important;
-    margin-bottom: 8px !important;
 }
-.stSidebar .streamlit-expanderHeader {
-    font-size: 0.9em !important;
-    padding: 5px !important;
 }
-.stSidebar .streamlit-expanderContent {
-    padding: 5px !important;
 }
-/* Better sidebar section spacing */
-.stSidebar h1, .stSidebar h2, .stSidebar h3, .stSidebar h4, .stSidebar h5 {
-    margin-top: 15px !important;
-    margin-bottom: 5px !important;
-    padding-top: 0 !important;
-    padding-bottom: 3px !important;
-    line-height: 1.2 !important;
-    font-weight: 600 !important;
 }
-/* First heading in sidebar doesn't need top margin */
-.stSidebar [data-testid="stVerticalBlock"] > div:first-child h5 {
-    margin-top: 0 !important;
 }
-/* Optimize sidebar checkbox positioning */
-.stSidebar .stCheckbox > div > div {
-    margin-bottom: 3px !important;
 }
-/* Metadata container styling */
-.metadata-container {
-    background-color: #f8f9fa;
-    border-radius: 4px;
-    padding: 12px;
-    margin-bottom: 20px;
-    margin-top: -10px !important; /* Negative margin to reduce gap with header */
-    border-left: 3px solid #4285f4;
-}
-/* Direct child styling to prevent nested containers */
-.element-container > .metadata-container {
-    margin-top: 0 !important;
-}
-/* Fix spacing for headings above metadata container */
-.element-container h3 + div .metadata-container,
-.element-container h1 + div .metadata-container,
-.element-container h2 + div .metadata-container,
-.stHeading + div div {
-    margin-top: 0 !important;
-}
-/* Fix for subheader and metadata container spacing */
-.stHeading ~ div {
-    margin-top: -10px !important;
-}
-/* Remove excess space between metadata heading and content */
-.stMarkdown + div div.element-container,
-.stMarkdown + div,
-.stHeading + div,
-.stHeading + div div.element-container,
-header + div.stMarkdown + div,
-[data-testid="stHeader"] + div,
-.stHeading + * {
-    margin-top: 0 !important;
-    padding-top: 0 !important;
-}
-/* PDF container fixes */
-.stExpander .streamlit-expanderContent {
-    max-width: 100% !important;
-    overflow: visible !important;
-}
-/* Fix placement of fullscreen buttons, especially in expanders */
-.element-container .stImage .stExpander button[title="View fullscreen"] {
-    position: absolute !important;
-    top: 5px !important;
-    right: 5px !important;
-}
-/* Fix PDF preview container */
-.stPdfViewerContent,
-.stPdfViewer,
-.stPdfViewerPagesContainer {
-    width: 100% !important;
-    max-width: 100% !important;
-    overflow: visible !important;
-}
-/* Fix for expandable content */
-.stExpander > div[data-testid="stExpander"] {
-    max-width: 100% !important;
-    overflow: visible !important;
-}
-/* Fix positioning for fullscreen buttons in image containers */
-.stImage button[title="View fullscreen"] {
-    position: absolute !important;
-    top: 5px !important;
-    right: 5px !important;
-    z-index: 1000 !important;
-    visibility: visible !important;
-    opacity: 1 !important;
-    width: 28px !important;
-    height: 28px !important;
-    padding: 0 !important;
-    margin: 0 !important;
-    background-color: rgba(255, 255, 255, 0.7) !important;
-    border-radius: 4px !important;
-    display: flex !important;
-    align-items: center !important;
-    justify-content: center !important;
-}
-/* Fix fullscreen button styling */
-button[title="View fullscreen"],
-button.streamlit-expanderHeader {
-    z-index: 999 !important;
-    visibility: visible !important;
-    opacity: 1 !important;
-    border-radius: 4px !important;
-    position: absolute !important;
-    top: 5px !important;
-    right: 5px !important;
-    width: 28px !important;
-    height: 28px !important;
-    padding: 0 !important;
-    margin: 0 !important;
-    background-color: rgba(255, 255, 255, 0.7) !important;
-    display: flex !important;
-    align-items: center !important;
-    justify-content: center !important;
-}
-/* Make text visible in Previous Results tab - ensure high contrast */
-.previous-results-container h3,
-.previous-results-container p,
-.previous-results-container .result-filename,
-.previous-results-container .result-date,
-.previous-results-container .result-tag {
-    color: #000000 !important;
-    text-shadow: none !important;
-}
-/* No Results styling with proper contrast */
-.previous-results-container[style*="text-align: center"] {
-    background-color: #f0f2f6 !important;
-    border-radius: 8px !important;
-    box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
-}
-/* Additional image fixes for all containers */
-.document-content img,
-.markdown-text-container img,
-.page-text-content img,
-.image-container img,
-.streamlit-expanderContent img {
-    max-width: 100% !important;
-    height: auto !important;
-    object-fit: contain !important;
-}
-/* Responsive design rules */
-/* Specific rules for mobile/small screens */
-@media (max-width: 768px) {
-    .stExpander img,
-    .document-content img,
-    .markdown-text-container img,
-    .page-text-content img,
-    .image-container img,
-    .streamlit-expanderContent img {
-        max-width: 95% !important;
-    }
-    /* Improve responsive layout for example documents */
-    .stImage,
-    .css-6qob1r,
-    .css-zq5wmm,
-    .css-fg4pbf,
-    [data-testid="column"],
-    [data-testid="stHorizontalBlock"] > div {
-        margin-bottom: 20px !important;
-        padding: 0 10px !important;
-    }
-    .stImage img {
-        width: 100% !important;
-        max-width: 100% !important;
-        height: auto !important;
-        object-fit: contain !important;
-    }
-    .stColumnContainer,
-    .css-jjjwqm,
-    .css-fg4pbf,
-    [data-testid="column"] {
-        gap: 20px !important;
-        margin-bottom: 20px !important;
-    }
-    /* Force separate columns on mid-sized screens */
-    [data-testid="stHorizontalBlock"] {
-        flex-wrap: wrap !important;
-    }
-    [data-testid="stHorizontalBlock"] > div {
-        min-width: 45% !important;
-        flex: 1 1 45% !important;
-    }
-}
-/* Modern Streamlit styling - better responsive behavior */
-.block-container {
-    padding-top: 2rem !important;
-    padding-bottom: 2rem !important;
-}
-/* Specific rules for very small screens (mobile) */
-@media (max-width: 640px) {
-    /* Force single column on very small screens */
-    .row-widget.stHorizontal > div,
-    div[data-testid="stHorizontalBlock"] > div {
-        flex-direction: column !important;
-        width: 100% !important;
-    }
-    /* Critical fix for column display to prevent overlapping */
-    [data-testid="column"] {
-        width: 100% !important;
-        flex: 1 1 100% !important;
-        padding: 0 !important;
-        min-width: 100% !important;
-        max-width: 100% !important;
-        float: none !important;
-        clear: both !important;
-        display: block !important;
-    }
-    /* Enforce correct column layout for Streamlit's container elements */
-    div[data-testid="stHorizontalBlock"] {
-        flex-direction: column !important;
-        display: block !important;
-    }
-    /* Make images more visible on small screens */
-    .row-widget.stImage img,
-    [data-testid="stImage"] > img {
-        max-width: 100% !important;
-        width: 100% !important;
-        margin-bottom: 15px !important;
-    }
-    /* Fix example documents grid layout */
-    .stImage {
-        display: block !important;
-        margin-left: auto !important;
-        margin-right: auto !important;
-        width: 100% !important;
-    }
-}
-/* Fix image display in grid layout */
-.row-widget.stImage,
-.css-z5fcl4 {
-    text-align: center !important;
-    margin-bottom: 15px !important;
-    padding: 0 !important;
-}
-.row-widget.stImage img,
-.css-z5fcl4 img {
-    max-height: 250px !important;
-    object-fit: contain !important;
-    border-radius: 4px !important;
-    border: 1px solid rgba(0, 0, 0, 0.1) !important;
-    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1) !important;
-}
-/* Better caption styling for images */
-.css-z5fcl4 .caption,
-.caption,
-[data-testid="caption"],
-.css-1b0udgb,
-.css-183lzff {
-    margin-top: 5px !important;
-    font-weight: 500 !important;
-    text-align: center !important;
-    font-size: 0.9rem !important;
-}

+/* Custom CSS for Historical OCR Application */
+/* Global styles */
+body {
+    font-family: 'Source Sans Pro', sans-serif;
+    color: #333;
 }
+/* Header styles */
+h1, h2, h3, h4, h5, h6 {
+    font-family: 'Georgia', serif;
+    font-weight: 600;
+    color: #1E3A8A;
 }
+/* Raw text editor styling */
+.raw-text-editor {
+    font-family: 'Courier New', monospace;
+    font-size: 14px;
+    line-height: 1.5;
+    border: 1px solid #ddd;
+    border-radius: 4px;
+    padding: 10px;
+    background-color: #f9f9f9;
+}
+/* Document content styling */
+.document-content {
+    margin-top: 20px;
+}
+.document-section {
+    margin-bottom: 20px;
+    padding: 15px;
+    background-color: #fff;
+    border-radius: 8px;
     border: 1px solid #e0e0e0;
+}
+.document-section h4 {
+    margin-top: 0;
+    margin-bottom: 10px;
+    color: #1E3A8A;
+}
+/* Subject tag styling */
+.subject-tag {
+    display: inline-block;
+    padding: 3px 8px;
+    border-radius: 12px;
+    font-size: 0.85em;
+    margin-right: 5px;
+    margin-bottom: 5px;
+    color: white;
+}
+.tag-time-period {
+    background-color: #1565c0;
+}
+.tag-language {
+    background-color: #00695c;
+}
+.tag-document-type {
+    background-color: #6a1b9a;
+}
+.tag-subject {
+    background-color: #2e7d32;
+}
+.tag-preprocessing {
+    background-color: #e65100;
+}
+.tag-default {
+    background-color: #546e7a;
+}
+/* Image and text side-by-side styling */
+.image-text-container {
+    display: flex;
+    gap: 20px;
+    margin-bottom: 20px;
+}
+.image-container {
+    flex: 1;
+}
+.text-container {
+    flex: 1;
+}
+/* Sidebar styling */
+.sidebar-section {
+    margin-bottom: 20px;
+}
+.sidebar-section h3 {
+    margin-top: 0;
+    margin-bottom: 10px;
+    font-size: 16px;
+}
+/* Button styling */
+.primary-button {
+    background-color: #1E88E5;
+    color: white;
+    border: none;
     border-radius: 4px;
+    padding: 8px 16px;
+    font-weight: 600;
+    cursor: pointer;
+    transition: background-color 0.2s;
+}
+.primary-button:hover {
+    background-color: #1565C0;
+}
+.secondary-button {
+    background-color: #f8f9fa;
+    color: #333;
+    border: 1px solid #ddd;
+    border-radius: 4px;
+    padding: 8px 16px;
+    font-weight: 600;
+    cursor: pointer;
+    transition: background-color 0.2s;
+}
+.secondary-button:hover {
+    background-color: #e9ecef;
+}
+/* Processing status styling */
+.processing-status {
+    padding: 10px 15px;
+    border-left: 4px solid #1E88E5;
+    background-color: #E3F2FD;
+    border-radius: 0 4px 4px 0;
+    margin: 10px 0;
+    font-size: 14px;
+}
+/* Previous results styling */
+.previous-results-container {
+    margin-top: 20px;
+}
+.result-card {
+    background-color: #f8f9fa;
+    border-radius: 8px;
     padding: 15px;
     margin-bottom: 15px;
+    border: 1px solid #e0e0e0;
+    transition: all 0.2s ease;
+}
+.result-card:hover {
+    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
+    border-color: #c0c0c0;
 }
 .result-header {
     display: flex;
     justify-content: space-between;
     margin-bottom: 10px;
 }
 .result-filename {
     font-weight: bold;
+    font-size: 16px;
 }
 .result-date {
     color: #666;
+    font-size: 14px;
 }
 .result-metadata {
+    margin-top: 10px;
+    font-size: 14px;
 }
 .result-tag {
+    margin-bottom: 5px;
+    color: #555;
+}
+.result-action-button {
+    margin-top: 10px;
+    text-align: right;
 }
 .selected-result-container {
+    margin-top: 30px;
     padding: 20px;
+    background-color: #f0f2f6;
+    border-radius: 8px;
+    border: 1px solid #d0d7de;
 }
 .selected-result-title {
+    font-size: 18px;
     font-weight: bold;
+    color: #1E3A8A;
 }
+/* About tab styling */
+.about-section {
+    margin-bottom: 30px;
 }
+.about-section h3 {
+    color: #1E3A8A;
+    margin-bottom: 10px;
 }
+.feature-list {
+    list-style-type: none;
+    padding-left: 0;
 }
+.feature-list li {
+    margin-bottom: 8px;
+    padding-left: 20px;
+    position: relative;
 }
+.feature-list li:before {
+    content: "•";
+    position: absolute;
+    left: 0;
+    color: #1E88E5;
 }
+/* File uploader styling */
+.file-uploader {
+    border: 2px dashed #ddd;
+    border-radius: 8px;
+    padding: 20px;
+    text-align: center;
+    transition: border-color 0.2s;
 }
+.file-uploader:hover {
+    border-color: #1E88E5;
 }
+/* Example documents styling */
+.example-documents {
+    margin-top: 20px;
 }
+.example-card {
+    background-color: #f8f9fa;
+    border-radius: 8px;
+    padding: 15px;
+    margin-bottom: 15px;
+    border: 1px solid #e0e0e0;
+    cursor: pointer;
+    transition: all 0.2s ease;
 }
+.example-card:hover {
+    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
+    border-color: #c0c0c0;
 }
+.example-title {
+    font-weight: bold;
+    font-size: 16px;
+    margin-bottom: 5px;
+}
+.example-description {
+    font-size: 14px;
+    color: #555;
+}

ui/layout.py CHANGED Viewed

@@ -1,27 +1,217 @@
-"""
-UI layout components for the OCR application.
-"""
-import os
 import streamlit as st
-from pathlib import Path
 def load_css():
-    """Load custom CSS for the application."""
-    # Get the directory of the current file
-    current_dir = Path(os.path.dirname(os.path.abspath(__file__)))
-    # Path to the CSS file
-    css_file = current_dir / "custom.css"
-    # Check if the file exists
-    if not css_file.exists():
-        st.warning(f"Custom CSS file not found at {css_file}")
-        return
-    # Read the CSS content
-    with open(css_file) as f:
-        css_content = f.read()
-    # Apply the CSS
-    st.markdown(f"<style>{css_content}</style>", unsafe_allow_html=True)

 import streamlit as st
 def load_css():
+    """Load custom CSS for the application"""
+    st.markdown("""
+    <style>
+    /* Global styles */
+    body {
+        font-family: 'Source Sans Pro', sans-serif;
+        color: #333;
+    }
+    /* Header styles */
+    h1, h2, h3, h4, h5, h6 {
+        font-family: 'Georgia', serif;
+        font-weight: 600;
+        color: #1E3A8A;
+    }
+    /* Processing status container */
+    .processing-status-container {
+        padding: 10px 15px;
+        border-left: 4px solid #1E88E5;
+        background-color: #E3F2FD;
+        border-radius: 0 4px 4px 0;
+        margin: 10px 0;
+        font-size: 14px;
+    }
+    /* Previous results styling */
+    .previous-results-container {
+        margin-top: 20px;
+    }
+    .result-card {
+        background-color: #f8f9fa;
+        border-radius: 8px;
+        padding: 15px;
+        margin-bottom: 15px;
+        border: 1px solid #e0e0e0;
+        transition: all 0.2s ease;
+    }
+    .result-card:hover {
+        box-shadow: 0 4px 8px rgba(0,0,0,0.1);
+        border-color: #c0c0c0;
+    }
+    .result-header {
+        display: flex;
+        justify-content: space-between;
+        margin-bottom: 10px;
+    }
+    .result-filename {
+        font-weight: bold;
+        font-size: 16px;
+    }
+    .result-date {
+        color: #666;
+        font-size: 14px;
+    }
+    .result-metadata {
+        margin-top: 10px;
+        font-size: 14px;
+    }
+    .result-tag {
+        margin-bottom: 5px;
+        color: #555;
+    }
+    .result-action-button {
+        margin-top: 10px;
+        text-align: right;
+    }
+    .selected-result-container {
+        margin-top: 30px;
+        padding: 20px;
+        background-color: #f0f2f6;
+        border-radius: 8px;
+        border: 1px solid #d0d7de;
+    }
+    .selected-result-title {
+        font-size: 18px;
+        font-weight: bold;
+        color: #1E3A8A;
+    }
+    /* Raw text editor styling */
+    .stTextArea textarea {
+        font-family: 'Courier New', monospace;
+        font-size: 14px;
+        line-height: 1.5;
+    }
+    /* Image and text side-by-side styling */
+    .image-text-container {
+        display: flex;
+        gap: 20px;
+        margin-bottom: 20px;
+    }
+    .image-container {
+        flex: 1;
+    }
+    .text-container {
+        flex: 1;
+    }
+    /* Sidebar styling */
+    .sidebar .stRadio > div {
+        flex-direction: row;
+    }
+    .sidebar .stRadio label {
+        margin-right: 10px;
+    }
+    /* Optimize spacing in sidebar */
+    .sidebar .block-container {
+        padding-top: 0;
+    }
+    .sidebar [data-testid="stVerticalBlock"] {
+        gap: 0;
+    }
+    /* Button styling */
+    .stButton > button {
+        border-radius: 4px;
+        font-weight: 600;
+    }
+    /* File uploader styling */
+    .stFileUploader > section > div {
+        min-height: 100px;
+    }
+    /* Reset vertical text in file uploader */
+    .stFileUploader p,
+    .stFileUploader span,
+    .stFileUploader div p,
+    .stFileUploader div span,
+    .stFileUploader label p,
+    .stFileUploader label span,
+    .stFileUploader div[data-testid="stFileUploadDropzone"] p,
+    .stFileUploader div[data-testid="stFileUploadDropzone"] span {
+        writing-mode: horizontal-tb !important;
+    }
+    /* Metadata styling */
+    .metadata-card {
+        background-color: #f8f9fa;
+        border-radius: 8px;
+        padding: 15px;
+        margin-bottom: 20px;
+        border: 1px solid #e0e0e0;
+    }
+    /* Document content styling */
+    .document-content {
+        margin-top: 10px;
+    }
+    /* Tab styling */
+    .stTabs [data-baseweb="tab-list"] {
+        gap: 8px;
+    }
+    .stTabs [data-baseweb="tab"] {
+        padding: 8px 16px;
+        border-radius: 4px 4px 0 0;
+    }
+    /* Success message styling */
+    .stSuccess {
+        background-color: #D4EDDA;
+        color: #155724;
+        padding: 10px;
+        border-radius: 4px;
+        border-left: 5px solid #155724;
+    }
+    /* Error message styling */
+    .stError {
+        background-color: #F8D7DA;
+        color: #721C24;
+        padding: 10px;
+        border-radius: 4px;
+        border-left: 5px solid #721C24;
+    }
+    /* Info message styling */
+    .stInfo {
+        background-color: #D1ECF1;
+        color: #0C5460;
+        padding: 10px;
+        border-radius: 4px;
+        border-left: 5px solid #0C5460;
+    }
+    /* Warning message styling */
+    .stWarning {
+        background-color: #FFF3CD;
+        color: #856404;
+        padding: 10px;
+        border-radius: 4px;
+        border-left: 5px solid #856404;
+    }
+    </style>
+    """, unsafe_allow_html=True)

ui_components.py ADDED Viewed

	@@ -0,0 +1,774 @@

+import streamlit as st
+import os
+import io
+import base64
+from datetime import datetime
+from pathlib import Path
+import json
+from constants import (
+    DOCUMENT_TYPES,
+    DOCUMENT_LAYOUTS,
+    CUSTOM_PROMPT_TEMPLATES,
+    LAYOUT_PROMPT_ADDITIONS,
+    DEFAULT_PDF_DPI,
+    MIN_PDF_DPI,
+    MAX_PDF_DPI,
+    DEFAULT_MAX_PAGES,
+    PERFORMANCE_MODES,
+    PREPROCESSING_DOC_TYPES,
+    ROTATION_OPTIONS
+)
+from utils import get_base64_from_image, extract_subject_tags
class ProgressReporter:
    """Drive a progress bar and a status line hosted inside a placeholder.

    The reporter is inert until :meth:`setup` has created the widgets;
    before that, :meth:`update` and :meth:`complete` silently skip the
    widgets that do not exist yet.
    """

    def __init__(self, placeholder):
        """Remember the placeholder that will host the progress widgets.

        Args:
            placeholder: Object whose ``container()`` result can be used as
                a context manager (presumably an ``st.empty()`` slot --
                confirm at the call site).
        """
        self.placeholder = placeholder
        self.progress_bar = None
        self.status_text = None

    def setup(self):
        """Create the progress bar and status slot; return ``self`` for chaining."""
        with self.placeholder.container():
            self.progress_bar = st.progress(0)
            self.status_text = st.empty()
        return self

    def update(self, percent, status_text):
        """Show ``percent`` on the bar and ``status_text`` on the status line.

        Args:
            percent: Progress on a 0-100 scale. Values outside that range
                are clamped, because the progress widget rejects
                out-of-range values and an imprecise estimate should not
                abort document processing.
            status_text: Message displayed next to the bar.
        """
        if self.progress_bar is not None:
            fraction = min(max(percent / 100, 0.0), 1.0)
            self.progress_bar.progress(fraction)
        if self.status_text is not None:
            self.status_text.text(status_text)

    def complete(self, success=True, delay=0.8):
        """Show the final state, pause briefly, then remove the widgets.

        Args:
            success: When true the bar is filled and a completion message is
                shown; otherwise only a failure message is shown.
            delay: Seconds to keep the final state visible before clearing.
                Pass ``0`` to clear immediately.
        """
        if success:
            if self.progress_bar is not None:
                # Integer 100 means "100 percent" on the progress widget.
                self.progress_bar.progress(100)
            if self.status_text is not None:
                self.status_text.text("Processing complete!")
        elif self.status_text is not None:
            self.status_text.text("Processing failed.")

        has_widgets = self.progress_bar is not None or self.status_text is not None
        # Only stall when there is something on screen for the user to read.
        if has_widgets and delay and delay > 0:
            import time
            time.sleep(delay)

        if self.progress_bar is not None:
            self.progress_bar.empty()
        if self.status_text is not None:
            self.status_text.empty()
def create_sidebar_options():
    """Render the OCR settings sidebar and return the selected options.

    Returns:
        dict: The keys ``use_vision``, ``perf_mode``, ``pdf_dpi``,
        ``max_pages``, ``pdf_rotation``, ``custom_prompt`` and
        ``preprocessing_options``. The last one is itself a dict with the
        keys ``document_type``, ``grayscale``, ``denoise``, ``contrast`` and
        ``rotation``. ``custom_prompt`` is an empty string while the first
        (auto-detect) document type is selected.
    """
    with st.sidebar:
        st.title("OCR Settings")
        with st.container():
            # Model selection
            st.subheader("Model Selection")
            use_vision = st.toggle(
                "Use Vision Model",
                value=True,
                help="Use vision model for better understanding of document structure",
            )
            perf_mode = st.radio(
                "Performance Mode",
                PERFORMANCE_MODES,
                horizontal=True,
                help="Quality: Best results but slower. Speed: Faster but may be less accurate.",
            )

            # Document type and layout
            st.subheader("Document Type")
            doc_type = st.selectbox(
                "Document Type",
                DOCUMENT_TYPES,
                help="Select the type of document you're processing for better results",
            )
            doc_layout = st.selectbox(
                "Document Layout",
                DOCUMENT_LAYOUTS,
                help="Select the layout of your document",
            )

            # Custom prompt: only offered when a concrete document type is chosen.
            custom_prompt = ""
            if doc_type != DOCUMENT_TYPES[0]:
                prompt_parts = [CUSTOM_PROMPT_TEMPLATES.get(doc_type, "")]
                if doc_layout != DOCUMENT_LAYOUTS[0]:
                    prompt_parts.append(LAYOUT_PROMPT_ADDITIONS.get(doc_layout, ""))
                # Join only the non-empty parts so a missing template does not
                # leave a stray leading space in front of the layout hint.
                default_prompt = " ".join(part for part in prompt_parts if part)
                # The widget carries its own (bold) label instead of an empty
                # one: Streamlit discourages empty labels for accessibility.
                custom_prompt = st.text_area(
                    "**Custom Processing Instructions**",
                    value=default_prompt,
                    help="Customize the instructions for processing this document",
                    height=100,
                )

            # Image preprocessing options
            st.subheader("Image Preprocessing")
            preprocessing_doc_type = st.radio(
                "Document Type",
                PREPROCESSING_DOC_TYPES,
                horizontal=True,
                help="Select the type of document for preprocessing",
            )
            grayscale = st.checkbox(
                "Convert to Grayscale",
                value=False,
                help="Convert color images to grayscale for better OCR",
            )
            denoise = st.checkbox(
                "Denoise Image",
                value=False,
                help="Remove noise from the image",
            )
            contrast = st.slider(
                "Contrast Adjustment",
                min_value=-50,
                max_value=50,
                value=0,
                step=10,
                help="Adjust image contrast",
            )
            rotation = st.slider(
                "Rotation",
                min_value=-45,
                max_value=45,
                value=0,
                step=5,
                help="Rotate image if needed",
            )
            preprocessing_options = {
                "document_type": preprocessing_doc_type,
                "grayscale": grayscale,
                "denoise": denoise,
                "contrast": contrast,
                "rotation": rotation,
            }

            # PDF-specific options
            st.subheader("PDF Options")
            # NOTE(review): with the constants in constants.py (min 72,
            # default 100) the default does not sit on the 25-DPI step grid --
            # confirm the slider behaves as intended.
            pdf_dpi = st.slider(
                "PDF Resolution (DPI)",
                min_value=MIN_PDF_DPI,
                max_value=MAX_PDF_DPI,
                value=DEFAULT_PDF_DPI,
                step=25,
                help="Higher DPI gives better quality but slower processing",
            )
            max_pages = st.number_input(
                "Maximum Pages to Process",
                min_value=1,
                max_value=20,  # same value as MAX_PAGES in constants.py
                value=DEFAULT_MAX_PAGES,
                help="Limit the number of pages to process (for multi-page PDFs)",
            )
            pdf_rotation = st.radio(
                "PDF Rotation",
                ROTATION_OPTIONS,
                horizontal=True,
                format_func=lambda x: f"{x}°",
                help="Rotate PDF pages if needed",
            )

            return {
                "use_vision": use_vision,
                "perf_mode": perf_mode,
                "pdf_dpi": pdf_dpi,
                "max_pages": max_pages,
                "pdf_rotation": pdf_rotation,
                "custom_prompt": custom_prompt,
                "preprocessing_options": preprocessing_options,
            }
def create_file_uploader():
    """Render the page header and introduction, then the upload widget.

    Returns:
        The value returned by ``st.file_uploader`` for the current run.
    """
    # Header: favicon embedded as a base64 data URI next to the title.
    icon_file = os.path.join(os.path.dirname(__file__), "static/favicon.png")
    icon_b64 = get_base64_from_image(icon_file)
    header_html = (
        '<div style="display: flex; align-items: center; gap: 10px;">'
        f'<img src="data:image/png;base64,{icon_b64}" width="36" height="36" alt="Scroll Icon"/>'
        ' <div><h1 style="margin: 0; padding: 20px 0 0 0;">Historical Document OCR</h1></div></div>'
    )
    st.markdown(header_html, unsafe_allow_html=True)
    st.subheader("Made possible by Mistral AI")

    # Project framing shown above the uploader.
    introduction = """
    This tool is designed to assist scholars in historical research by extracting text from challenging documents.
    While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
    historical documents, particularly:
    - **Historical newspapers** with complex layouts and aged text
    - **Handwritten documents** from various time periods
    - **Photos of archival materials** that may be difficult to read
    Upload a document to get started, or explore the example documents.
    """
    st.markdown(introduction)

    accepted_extensions = ["pdf", "png", "jpg", "jpeg"]
    return st.file_uploader(
        "Upload a document",
        type=accepted_extensions,
        help="Upload a PDF or image file for OCR processing"
    )
def _topic_badge_color(topic):
    """Return the badge background color for a subject tag (first matching category wins)."""
    lowered = str(topic).lower()
    categories = (
        ("#1565c0", ("century", "pre-", "era", "historical")),  # time periods
        ("#00695c", ("language", "english", "french", "german", "latin")),  # languages
        ("#6a1b9a", ("letter", "newspaper", "book", "form", "document", "recipe")),  # document types
        ("#2e7d32", ("travel", "military", "science", "medicine", "education", "art", "literature")),  # subject domains
        ("#e65100", ("preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated")),  # preprocessing
    )
    for color, terms in categories:
        if any(term in lowered for term in terms):
            return color
    return "#546e7a"  # default


def _subject_tags_html(topics):
    """Build one HTML fragment holding a flex container with a badge per topic."""
    import html  # local import: only needed when tags are rendered

    badges = "".join(
        f'<span style="background-color: {_topic_badge_color(topic)}; color: white; padding: 3px 8px; '
        f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">'
        # Topics originate from OCR/model output, so escape them before
        # they are emitted as raw HTML.
        f'{html.escape(str(topic))}</span>'
        for topic in topics
    )
    return (
        '<div style="display: flex; flex-wrap: wrap; gap: 5px; margin-top: 5px;">'
        f'{badges}</div>'
    )


def _ocr_text(ocr_contents, default=""):
    """Return the plain OCR text held in ``ocr_contents``.

    A dict is searched for ``raw_text`` and then ``content``; a string is
    returned unchanged; anything else yields ``default``.
    """
    if isinstance(ocr_contents, str):
        return ocr_contents
    if isinstance(ocr_contents, dict):
        for key in ("raw_text", "content"):
            if key in ocr_contents:
                value = ocr_contents[key]
                return "" if value is None else str(value)
    return default


def _download_stem(result):
    """Return the result's file name without directory or final extension."""
    name = result.get('file_name') or 'document'
    return Path(str(name)).stem or 'document'


def display_results(result, container, custom_prompt=""):
    """Render OCR results (metadata, tags, content tabs, downloads) in ``container``.

    Args:
        result: Mapping describing the OCR outcome. The keys read here are
            ``detected_document_type``, ``languages``, ``processing_time``,
            ``limited_pages`` (with ``processed`` and ``total``), ``topics``,
            ``ocr_contents``, ``has_images`` and ``file_name``; all are
            optional.
        container: Context manager in which the widgets are created.
        custom_prompt: Processing instructions to show in an expander; nothing
            is shown when empty.
    """
    with container:
        file_stem = _download_stem(result)

        # Document metadata, split across two columns.
        st.subheader("Document Metadata")
        meta_col1, meta_col2 = st.columns(2)
        with meta_col1:
            if 'detected_document_type' in result:
                st.write(f"**Document Type:** {result['detected_document_type']}")
            languages = [
                str(lang) for lang in (result.get('languages') or []) if lang is not None
            ]
            if languages:
                st.write(f"**Languages:** {', '.join(languages)}")
        with meta_col2:
            if 'processing_time' in result:
                st.write(f"**Processing Time:** {result['processing_time']:.1f}s")
            if 'limited_pages' in result:
                st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")

        # Subject tags: emitted as a single HTML fragment, because markup
        # split across several st.markdown calls is rendered as separate
        # elements and the wrapper would not enclose the badges.
        topics = result.get('topics')
        if topics:
            st.write("**Subject Tags:**")
            st.markdown(_subject_tags_html(topics), unsafe_allow_html=True)

        # OCR content
        st.subheader("OCR Content")
        if 'ocr_contents' in result:
            ocr_contents = result['ocr_contents']
            has_images = result.get('has_images', False)
            tab_labels = ["Structured View", "Raw Text"]
            if has_images:
                tab_labels.append("With Images")
            tabs = st.tabs(tab_labels)

            with tabs[0]:
                # Structured view: one heading per non-empty section.
                if isinstance(ocr_contents, dict):
                    for section, content in ocr_contents.items():
                        # Error and raw-text sections are not shown here.
                        if not content or section in ('error', 'raw_text', 'partial_text'):
                            continue
                        st.markdown(f"#### {section.replace('_', ' ').title()}")
                        if isinstance(content, str):
                            st.write(content)
                        elif isinstance(content, list):
                            for item in content:
                                st.write(f"- {item}")
                        elif isinstance(content, dict):
                            for k, v in content.items():
                                st.write(f"**{k}:** {v}")

            with tabs[1]:
                # Editable raw text.
                edited_text = st.text_area("Edit Raw Text", _ocr_text(ocr_contents), height=400)
                if st.button("Copy to Clipboard"):
                    # A server-side callback cannot write to the browser
                    # clipboard, so show the text in a code block, which
                    # carries its own copy control, instead of claiming success.
                    st.code(edited_text, language=None)
                    st.info("Use the copy icon on the block above to copy the text.")
                st.download_button(
                    label="Download Edited Text",
                    data=edited_text,
                    file_name=f"{file_stem}_edited.txt",
                    mime="text/plain"
                )

            if has_images:
                with tabs[2]:
                    # The helper reports missing page data itself, so the tab
                    # is never left blank.
                    display_document_with_images(result)

        # Custom prompt, if any.
        if custom_prompt:
            with st.expander("Custom Processing Instructions"):
                st.write(custom_prompt)

        # Downloads
        st.subheader("Download Results")
        download_col1, download_col2 = st.columns(2)
        with download_col1:
            # Any failure here is reported in the UI rather than aborting the page.
            try:
                json_str = json.dumps(result, indent=2)
                st.download_button(
                    label="Download JSON",
                    data=json_str,
                    file_name=f"{file_stem}_ocr.json",
                    mime="application/json"
                )
            except Exception as e:
                st.error(f"Error creating JSON download: {str(e)}")
        with download_col2:
            try:
                if 'ocr_contents' in result:
                    contents = result['ocr_contents']
                    text_content = _ocr_text(contents, default=str(contents))
                else:
                    text_content = "No text content available."
                st.download_button(
                    label="Download Text",
                    data=text_content,
                    file_name=f"{file_stem}_ocr.txt",
                    mime="text/plain"
                )
            except Exception as e:
                st.error(f"Error creating text download: {str(e)}")
+def display_document_with_images(result):
+    """Display document with images"""
+    if 'pages_data' not in result:
+        st.info("No image data available.")
+        return
+    # Display each page
+    for i, page_data in enumerate(result['pages_data']):
+        st.markdown(f"### Page {i+1}")
+        # Create columns for image and text
+        img_col, text_col = st.columns([1, 1])
+        with img_col:
+            # Display the image
+            if 'image_data' in page_data:
+                try:
+                    # Convert base64 to image
+                    image_data = base64.b64decode(page_data['image_data'])
+                    st.image(io.BytesIO(image_data), use_column_width=True)
+                except Exception as e:
+                    st.error(f"Error displaying image: {str(e)}")
+            else:
+                st.info("No image available for this page.")
+        with text_col:
+            # Display the text with editing capability
+            if 'text' in page_data:
+                edited_text = st.text_area(f"Page {i+1} Text", page_data['text'], height=300, key=f"page_text_{i}")
+                # Add a button to copy the edited text to clipboard
+                if st.button(f"Copy Page {i+1} Text", key=f"copy_btn_{i}"):
+                    st.success(f"Page {i+1} text copied to clipboard!")
+            else:
+                st.info("No text available for this page.")
+def display_previous_results():
+    """Display previous results tab content"""
+    st.markdown('<h2>Previous Results</h2>', unsafe_allow_html=True)
+    # Load custom CSS for Previous Results tab
+    try:
+        from ui.layout import load_css
+        load_css()
+    except ImportError:
+        # If ui.layout module is not available, use a simplified version
+        st.markdown("""
+        <style>
+        .previous-results-container {
+            margin-top: 20px;
+        }
+        .result-card {
+            background-color: #f8f9fa;
+            border-radius: 8px;
+            padding: 15px;
+            margin-bottom: 15px;
+            border: 1px solid #e0e0e0;
+        }
+        .result-header {
+            display: flex;
+            justify-content: space-between;
+            margin-bottom: 10px;
+        }
+        .result-filename {
+            font-weight: bold;
+            font-size: 16px;
+        }
+        .result-date {
+            color: #666;
+            font-size: 14px;
+        }
+        .result-metadata {
+            margin-top: 10px;
+            font-size: 14px;
+        }
+        .result-tag {
+            margin-bottom: 5px;
+            color: #555;
+        }
+        .result-action-button {
+            margin-top: 10px;
+            text-align: right;
+        }
+        .selected-result-container {
+            margin-top: 30px;
+            padding: 20px;
+            background-color: #f0f2f6;
+            border-radius: 8px;
+        }
+        .selected-result-title {
+            font-size: 18px;
+            font-weight: bold;
+        }
+        </style>
+        """, unsafe_allow_html=True)
+    # Display previous results if available
+    if not st.session_state.previous_results:
+        st.markdown("""
+        <div class="previous-results-container" style="text-align: center; padding: 40px 20px; background-color: #f0f2f6; border-radius: 8px;">
+            <div style="font-size: 48px; margin-bottom: 20px;">📄</div>
+            <h3 style="margin-bottom: 10px; font-weight: 600;">No Previous Results</h3>
+            <p style="font-size: 16px;">Process a document to see your results history saved here.</p>
+        </div>
+        """, unsafe_allow_html=True)
+    else:
+        # Create a container for the results list
+        st.markdown('<div class="previous-results-container">', unsafe_allow_html=True)
+        st.markdown(f'<h3>{len(st.session_state.previous_results)} Previous Results</h3>', unsafe_allow_html=True)
+        # Create two columns for filters and download buttons
+        filter_col, download_col = st.columns([2, 1])
+        with filter_col:
+            # Add filter options
+            filter_options = ["All Types"]
+            if any(result.get("file_name", "").lower().endswith(".pdf") for result in st.session_state.previous_results):
+                filter_options.append("PDF Documents")
+            if any(result.get("file_name", "").lower().endswith((".jpg", ".jpeg", ".png")) for result in st.session_state.previous_results):
+                filter_options.append("Images")
+            selected_filter = st.selectbox("Filter by Type:", filter_options)
+        with download_col:
+            # Add download all button for results
+            if len(st.session_state.previous_results) > 0:
+                try:
+                    # Create buffer in memory instead of file on disk
+                    import io
+                    from ocr_utils import create_results_zip_in_memory
+                    # Get zip data directly in memory
+                    zip_data = create_results_zip_in_memory(st.session_state.previous_results)
+                    # Create more informative ZIP filename with timestamp
+                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+                    # Count document types for a more descriptive filename
+                    pdf_count = sum(1 for r in st.session_state.previous_results if r.get('file_name', '').lower().endswith('.pdf'))
+                    img_count = sum(1 for r in st.session_state.previous_results if r.get('file_name', '').lower().endswith(('.jpg', '.jpeg', '.png')))
+                    # Create more descriptive filename
+                    if pdf_count > 0 and img_count > 0:
+                        zip_filename = f"historical_ocr_mixed_{pdf_count}pdf_{img_count}img_{timestamp}.zip"
+                    elif pdf_count > 0:
+                        zip_filename = f"historical_ocr_pdf_documents_{pdf_count}_{timestamp}.zip"
+                    elif img_count > 0:
+                        zip_filename = f"historical_ocr_images_{img_count}_{timestamp}.zip"
+                    else:
+                        zip_filename = f"historical_ocr_results_{timestamp}.zip"
+                    st.download_button(
+                        label="Download All Results",
+                        data=zip_data,
+                        file_name=zip_filename,
+                        mime="application/zip",
+                        help="Download all previous results as a ZIP file containing HTML and JSON files"
+                    )
+                except Exception as e:
+                    st.error(f"Error creating download: {str(e)}")
+                    st.info("Try with fewer results or individual downloads")
+        # Filter results based on selection
+        filtered_results = st.session_state.previous_results
+        if selected_filter == "PDF Documents":
+            filtered_results = [r for r in st.session_state.previous_results if r.get("file_name", "").lower().endswith(".pdf")]
+        elif selected_filter == "Images":
+            filtered_results = [r for r in st.session_state.previous_results if r.get("file_name", "").lower().endswith((".jpg", ".jpeg", ".png"))]
+        # Show a message if no results match the filter
+        if not filtered_results:
+            st.markdown("""
+            <div style="text-align: center; padding: 20px; background-color: #f9f9f9; border-radius: 5px; margin: 20px 0;">
+                <p>No results match the selected filter.</p>
+            </div>
+            """, unsafe_allow_html=True)
+        # Display each result as a card
+        for i, result in enumerate(filtered_results):
+            # Determine file type icon
+            file_name = result.get("file_name", f"Document {i+1}")
+            file_type_lower = file_name.lower()
+            if file_type_lower.endswith(".pdf"):
+                icon = "📄"
+            elif file_type_lower.endswith((".jpg", ".jpeg", ".png", ".gif")):
+                icon = "🖼️"
+            else:
+                icon = "📝"
+            # Create a card for each result
+            st.markdown(f"""
+            <div class="result-card">
+                <div class="result-header">
+                    <div class="result-filename">{icon} {result.get('descriptive_file_name', file_name)}</div>
+                    <div class="result-date">{result.get('timestamp', 'Unknown')}</div>
+                </div>
+                <div class="result-metadata">
+                    <div class="result-tag">Languages: {', '.join(result.get('languages', ['Unknown']))}</div>
+                    <div class="result-tag">Topics: {', '.join(result.get('topics', ['Unknown'])[:5])} {' + ' + str(len(result.get('topics', [])) - 5) + ' more' if len(result.get('topics', [])) > 5 else ''}</div>
+                </div>
+            """, unsafe_allow_html=True)
+            # Add view button inside the card with proper styling
+            st.markdown('<div class="result-action-button">', unsafe_allow_html=True)
+            if st.button(f"View Document", key=f"view_{i}"):
+                # Set the selected result in the session state
+                st.session_state.selected_previous_result = st.session_state.previous_results[i]
+                # Force a rerun to show the selected result
+                st.rerun()
+            st.markdown('</div>', unsafe_allow_html=True)
+            # Close the result card
+            st.markdown('</div>', unsafe_allow_html=True)
+        # Close the container
+        st.markdown('</div>', unsafe_allow_html=True)
+        # Display the selected result if available
+        if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
+            selected_result = st.session_state.selected_previous_result
+            # Create a styled container for the selected result
+            st.markdown(f"""
+            <div class="selected-result-container">
+                <div class="result-header" style="margin-bottom: 20px;">
+                    <div class="selected-result-title">Selected Document: {selected_result.get('file_name', 'Unknown')}</div>
+                    <div class="result-date">{selected_result.get('timestamp', '')}</div>
+                </div>
+            """, unsafe_allow_html=True)
+            # Display metadata in a styled way
+            meta_col1, meta_col2 = st.columns(2)
+            with meta_col1:
+                # Display document metadata
+                if 'languages' in selected_result:
+                    languages = [lang for lang in selected_result['languages'] if lang is not None]
+                    if languages:
+                        st.write(f"**Languages:** {', '.join(languages)}")
+                if 'topics' in selected_result and selected_result['topics']:
+                    # Show topics in a more organized way with badges
+                    st.markdown("**Subject Tags:**")
+                    # Create a container with flex display for the tags
+                    st.markdown('<div style="display: flex; flex-wrap: wrap; gap: 5px; margin-top: 5px;">', unsafe_allow_html=True)
+                    # Generate a badge for each tag
+                    for topic in selected_result['topics']:
+                        # Create colored badge based on tag category
+                        badge_color = "#546e7a"  # Default color
+                        # Assign colors by category
+                        if any(term in topic.lower() for term in ["century", "pre-", "era", "historical"]):
+                            badge_color = "#1565c0"  # Blue for time periods
+                        elif any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
+                            badge_color = "#00695c"  # Teal for languages
+                        elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
+                            badge_color = "#6a1b9a"  # Purple for document types
+                        elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
+                            badge_color = "#2e7d32"  # Green for subject domains
+                        elif any(term in topic.lower() for term in ["preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated"]):
+                            badge_color = "#e65100"  # Orange for preprocessing-related tags
+                        st.markdown(
+                            f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
+                            f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">{topic}</span>',
+                            unsafe_allow_html=True
+                        )
+                    # Close the container
+                    st.markdown('</div>', unsafe_allow_html=True)
+            with meta_col2:
+                # Display processing metadata
+                if 'limited_pages' in selected_result:
+                    st.info(f"Processed {selected_result['limited_pages']['processed']} of {selected_result['limited_pages']['total']} pages")
+                if 'processing_time' in selected_result:
+                    proc_time = selected_result['processing_time']
+                    st.write(f"**Processing Time:** {proc_time:.1f}s")
+            # Create tabs for content display
+            has_images = selected_result.get('has_images', False)
+            if has_images:
+                view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw Text", "With Images"])
+            else:
+                view_tab1, view_tab2 = st.tabs(["Structured View", "Raw Text"])
+            with view_tab1:
+                # Display structured content
+                if 'ocr_contents' in selected_result and isinstance(selected_result['ocr_contents'], dict):
+                    for section, content in selected_result['ocr_contents'].items():
+                        if content and section not in ['error', 'raw_text', 'partial_text']:  # Skip error and raw text sections
+                            st.markdown(f"#### {section.replace('_', ' ').title()}")
+                            if isinstance(content, str):
+                                st.write(content)
+                            elif isinstance(content, list):
+                                for item in content:
+                                    if isinstance(item, str):
+                                        st.write(f"- {item}")
+                                    else:
+                                        st.write(f"- {str(item)}")
+                            elif isinstance(content, dict):
+                                for k, v in content.items():
+                                    st.write(f"**{k}:** {v}")
+            with view_tab2:
+                # Display raw text with editing capability
+                raw_text = ""
+                if 'ocr_contents' in selected_result:
+                    if 'raw_text' in selected_result['ocr_contents']:
+                        raw_text = selected_result['ocr_contents']['raw_text']
+                    elif 'content' in selected_result['ocr_contents']:
+                        raw_text = selected_result['ocr_contents']['content']
+                # Allow editing of the raw text
+                edited_text = st.text_area("Edit Raw Text", raw_text, height=400, key="selected_raw_text")
+                # Add a button to copy the edited text to clipboard
+                if st.button("Copy to Clipboard", key="selected_copy_btn"):
+                    st.success("Text copied to clipboard! (You can paste it elsewhere)")
+                # Add a download button for the edited text
+                st.download_button(
+                    label="Download Edited Text",
+                    data=edited_text,
+                    file_name=f"{selected_result.get('file_name', 'document').split('.')[0]}_edited.txt",
+                    mime="text/plain",
+                    key="selected_download_btn"
+                )
+            if has_images and 'pages_data' in selected_result:
+                with view_tab3:
+                    # Use the display_document_with_images function
+                    display_document_with_images(selected_result)
+            # Close the container
+            st.markdown('</div>', unsafe_allow_html=True)
+            # Add a button to close the selected result
+            if st.button("Close Selected Document", key="close_selected"):
+                # Clear the selected result from session state
+                del st.session_state.selected_previous_result
+                # Force a rerun to update the view
+                st.rerun()
+def display_about_tab():
+    """Display about tab content"""
+    st.markdown('<h2>About Historical OCR</h2>', unsafe_allow_html=True)
+    # Add app description
+    st.markdown("""
+    **Historical OCR** is a specialized tool for extracting text from historical documents, manuscripts, and printed materials.
+    ### Purpose
+    This tool is designed to assist scholars in historical research by extracting text from challenging documents.
+    While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
+    historical documents, particularly:
+    - **Historical newspapers** with complex layouts and aged text
+    - **Handwritten documents** from various time periods
+    - **Photos of archival materials** that may be difficult to read
+    ### Features
+    - **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
+    - **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
+    - **Editable Results**: Review and edit extracted text directly in the interface
+    - **Structured Content Analysis**: Automatic organization of document content
+    - **Multi-language Support**: Process documents in various languages
+    - **PDF Processing**: Handle multi-page historical documents
+    ### How to Use
+    1. Upload a document (PDF or image)
+    2. Select the document type and adjust preprocessing options if needed
+    3. Add custom processing instructions for specialized documents
+    4. Process the document
+    5. Review, edit, and download the results
+    ### Technologies
+    - OCR processing using Mistral AI's advanced document understanding capabilities
+    - Image preprocessing with OpenCV
+    - PDF handling with pdf2image
+    - Web interface with Streamlit
+    """)
+    # Add version information
+    st.markdown("**Version:** 1.0.0")

utils.py ADDED Viewed

	@@ -0,0 +1,263 @@

+import os
+import base64
+import hashlib
+import time
+import logging
+from datetime import datetime
+from pathlib import Path
+from functools import wraps
+from constants import CONTENT_THEMES, PERIOD_TAGS, DEFAULT_TAGS, GENERIC_TAGS
+# Configure logging
+logger = logging.getLogger("utils")
+logger.setLevel(logging.INFO)
+def get_base64_from_image(image_path):
+    """Get base64 string from image file"""
+    try:
+        with open(image_path, "rb") as img_file:
+            return base64.b64encode(img_file.read()).decode('utf-8')
+    except Exception as e:
+        logger.error(f"Error encoding image to base64: {str(e)}")
+        return ""
+def timing(description):
+    """Context manager for timing code execution"""
+    class TimingContext:
+        def __init__(self, description):
+            self.description = description
+        def __enter__(self):
+            self.start_time = time.time()
+            return self
+        def __exit__(self, exc_type, exc_val, exc_tb):
+            end_time = time.time()
+            execution_time = end_time - self.start_time
+            logger.info(f"{self.description} took {execution_time:.2f} seconds")
+            return False
+    return TimingContext(description)
+def format_timestamp(timestamp=None):
+    """Format timestamp for display"""
+    if timestamp is None:
+        timestamp = datetime.now()
+    elif isinstance(timestamp, str):
+        try:
+            timestamp = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
+        except ValueError:
+            timestamp = datetime.now()
+    return timestamp.strftime("%Y-%m-%d %H:%M")
+def generate_cache_key(file_bytes, file_type, use_vision, preprocessing_options=None, pdf_rotation=0, custom_prompt=None):
+    """
+    Generate a cache key for OCR processing
+    Args:
+        file_bytes: File content as bytes
+        file_type: Type of file (pdf or image)
+        use_vision: Whether to use vision model
+        preprocessing_options: Dictionary of preprocessing options
+        pdf_rotation: PDF rotation value
+        custom_prompt: Custom prompt for OCR
+    Returns:
+        str: Cache key
+    """
+    # Generate file hash
+    file_hash = hashlib.md5(file_bytes).hexdigest()
+    # Include preprocessing options in cache key
+    preprocessing_options_hash = ""
+    if preprocessing_options:
+        # Add pdf_rotation to preprocessing options to ensure it's part of the cache key
+        if pdf_rotation != 0:
+            preprocessing_options_with_rotation = preprocessing_options.copy()
+            preprocessing_options_with_rotation['pdf_rotation'] = pdf_rotation
+            preprocessing_str = str(sorted(preprocessing_options_with_rotation.items()))
+        else:
+            preprocessing_str = str(sorted(preprocessing_options.items()))
+        preprocessing_options_hash = hashlib.md5(preprocessing_str.encode()).hexdigest()
+    elif pdf_rotation != 0:
+        # If no preprocessing options but we have rotation, include that in the hash
+        preprocessing_options_hash = hashlib.md5(f"pdf_rotation_{pdf_rotation}".encode()).hexdigest()
+    # Create base cache key
+    cache_key = f"{file_hash}_{file_type}_{use_vision}_{preprocessing_options_hash}"
+    # Include custom prompt in cache key if provided
+    if custom_prompt:
+        custom_prompt_hash = hashlib.md5(str(custom_prompt).encode()).hexdigest()
+        cache_key = f"{cache_key}_{custom_prompt_hash}"
+    return cache_key
+def handle_temp_files(temp_file_paths):
+    """
+    Clean up temporary files
+    Args:
+        temp_file_paths: List of temporary file paths to clean up
+    """
+    for temp_path in temp_file_paths:
+        try:
+            if os.path.exists(temp_path):
+                os.unlink(temp_path)
+                logger.info(f"Removed temporary file: {temp_path}")
+        except Exception as e:
+            logger.warning(f"Failed to remove temporary file {temp_path}: {str(e)}")
+def create_descriptive_filename(original_filename, result, file_ext, preprocessing_options=None):
+    """
+    Create a descriptive filename for the result
+    Args:
+        original_filename: Original filename
+        result: OCR result dictionary
+        file_ext: File extension
+        preprocessing_options: Dictionary of preprocessing options
+    Returns:
+        str: Descriptive filename
+    """
+    # Get base name without extension
+    original_name = Path(original_filename).stem
+    # Add document type to filename if detected
+    doc_type_tag = ""
+    if 'detected_document_type' in result:
+        doc_type = result['detected_document_type'].lower()
+        doc_type_tag = f"_{doc_type.replace(' ', '_')}"
+    elif 'topics' in result and result['topics']:
+        # Use first tag as document type if not explicitly detected
+        doc_type_tag = f"_{result['topics'][0].lower().replace(' ', '_')}"
+    # Add period tag for historical context if available
+    period_tag = ""
+    if 'topics' in result and result['topics']:
+        for tag in result['topics']:
+            if "century" in tag.lower() or "pre-" in tag.lower() or "era" in tag.lower():
+                period_tag = f"_{tag.lower().replace(' ', '_')}"
+                break
+    # Generate final descriptive filename
+    descriptive_name = f"{original_name}{doc_type_tag}{period_tag}{file_ext}"
+    return descriptive_name
+def extract_subject_tags(result, raw_text, preprocessing_options=None):
+    """
+    Extract subject tags from OCR result
+    Args:
+        result: OCR result dictionary
+        raw_text: Raw text from OCR
+        preprocessing_options: Dictionary of preprocessing options
+    Returns:
+        list: Subject tags
+    """
+    subject_tags = []
+    try:
+        # Use existing topics as starting point if available
+        if 'topics' in result and result['topics']:
+            subject_tags = list(result['topics'])
+        # Add document type if detected
+        if 'detected_document_type' in result:
+            doc_type = result['detected_document_type'].capitalize()
+            if doc_type not in subject_tags:
+                subject_tags.append(doc_type)
+        # Analyze content for common themes based on keywords
+        if raw_text:
+            raw_text_lower = raw_text.lower()
+            for theme, keywords in CONTENT_THEMES.items():
+                if any(keyword in raw_text_lower for keyword in keywords):
+                    if theme not in subject_tags:
+                        subject_tags.append(theme)
+        # Add document period tag if date patterns are detected
+        if raw_text:
+            # Look for years in content
+            import re
+            year_matches = re.findall(r'\b1[0-9]{3}\b|\b20[0-1][0-9]\b', raw_text)
+            if year_matches:
+                # Convert to integers
+                years = [int(y) for y in year_matches]
+                # Get earliest year
+                earliest = min(years)
+                # Find the period tag for this year
+                for year_range, period_tag in PERIOD_TAGS.items():
+                    if year_range[0] <= earliest <= year_range[1]:
+                        if period_tag not in subject_tags:
+                            subject_tags.append(period_tag)
+                        break
+        # Add languages as topics if available
+        if 'languages' in result and result['languages']:
+            for lang in result['languages']:
+                if lang and lang not in subject_tags:
+                    lang_tag = f"{lang} Language"
+                    subject_tags.append(lang_tag)
+        # Add preprocessing information as tags if preprocessing was applied
+        if preprocessing_options:
+            preprocessing_methods = []
+            if preprocessing_options.get("document_type", "standard") != "standard":
+                doc_type = preprocessing_options["document_type"].capitalize()
+                preprocessing_tag = f"Enhanced ({doc_type})"
+                if preprocessing_tag not in subject_tags:
+                    subject_tags.append(preprocessing_tag)
+            if preprocessing_options.get("grayscale", False):
+                preprocessing_methods.append("Grayscale")
+            if preprocessing_options.get("denoise", False):
+                preprocessing_methods.append("Denoised")
+            if preprocessing_options.get("contrast", 0) != 0:
+                contrast_val = preprocessing_options.get("contrast", 0)
+                if contrast_val > 0:
+                    preprocessing_methods.append("Contrast Enhanced")
+                else:
+                    preprocessing_methods.append("Contrast Reduced")
+            if preprocessing_options.get("rotation", 0) != 0:
+                preprocessing_methods.append("Rotated")
+            # Add a combined preprocessing tag if methods were applied
+            if preprocessing_methods:
+                prep_tag = "Preprocessed"
+                if prep_tag not in subject_tags:
+                    subject_tags.append(prep_tag)
+                # Add the specific method as a tag if only one was used
+                if len(preprocessing_methods) == 1:
+                    method_tag = preprocessing_methods[0]
+                    if method_tag not in subject_tags:
+                        subject_tags.append(method_tag)
+    except Exception as e:
+        logger.warning(f"Error generating subject tags: {str(e)}")
+        # Fallback tags if extraction fails
+        if not subject_tags:
+            subject_tags = DEFAULT_TAGS.copy()
+    # Ensure we have at least 3 tags
+    while len(subject_tags) < 3:
+        for tag in DEFAULT_TAGS:
+            if tag not in subject_tags:
+                subject_tags.append(tag)
+                break
+        else:
+            # If all default tags are already used, add generic ones
+            for tag in GENERIC_TAGS:
+                if tag not in subject_tags:
+                    subject_tags.append(tag)
+                    break
+            else:
+                # If we still can't add any more tags, break the loop
+                break
+    return subject_tags