Spaces:

milwright
/

historical-ocr

Running

App Files Files Community

milwright committed on Mar 28

Commit

59aaeae

0 Parent(s):

Update Historical OCR with specified input files

Browse files

Files changed (24) hide show

.gitattributes +4 -0
README.md +46 -0
app.py +1672 -0
config.py +57 -0
input/The Magician, or Bottle Cungerer.jpeg +3 -0
input/americae-retectio.jpg +3 -0
input/handwritten-letter.jpg +3 -0
input/harpers.pdf +3 -0
input/magellan-travels.jpg +3 -0
input/milgram-flier.png +3 -0
input/recipe.jpg +3 -0
ocr_utils.py +1255 -0
packages.txt +2 -0
pdf_ocr.py +76 -0
process_file.py +68 -0
requirements.txt +17 -0
static/favicon.ico +0 -0
static/favicon.png +3 -0
static/scroll.svg +8 -0
structured_ocr.py +1718 -0
ui/__pycache__/layout.cpython-312.pyc +0 -0
ui/__pycache__/layout.cpython-313.pyc +0 -0
ui/custom.css +67 -0
ui/layout.py +172 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,4 @@

+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,46 @@

+---
+title: Historical OCR with Contextual Intelligence
+emoji: 📜
+colorFrom: indigo
+colorTo: purple
+sdk: streamlit
+sdk_version: "1.28.0"
+app_file: app.py
+pinned: false
+---
+# Historical OCR with Contextual Intelligence
+An advanced OCR application for historical document analysis using Mistral AI.
+## Features
+- **OCR with Context:** AI-enhanced OCR optimized for historical documents
+- **Document Type Detection:** Automatically identifies handwritten letters, recipes, scientific texts, and more
+- **Image Preprocessing:** Optimizes images for better text recognition
+- **Custom Prompting:** Tailor the AI analysis with document-specific instructions
+- **Structured Output:** Returns organized, structured information based on document type
+## Using This App
+1. Upload a historical document (image or PDF)
+2. Add optional context or special instructions
+3. Get detailed, structured OCR results with historical context
+## Supported Document Types
+- Handwritten letters and correspondence
+- Historical recipes and cookbooks
+- Travel accounts and exploration logs
+- Scientific papers and experiments
+- Legal documents and certificates
+- Historical newspaper articles
+- General historical texts
+## Technical Details
+Built with Streamlit and Mistral AI's OCR and large language model capabilities.
+---
+Created by Zach Muhlbauer, CUNY Graduate Center

app.py ADDED Viewed

	@@ -0,0 +1,1672 @@

+import os
+import streamlit as st
+import json
+import sys
+import time
+import base64
+from pathlib import Path
+import tempfile
+import io
+from pdf2image import convert_from_bytes
+from PIL import Image, ImageEnhance, ImageFilter
+import cv2
+import numpy as np
+from datetime import datetime
+# Import the StructuredOCR class and config from the local files
+from structured_ocr import StructuredOCR
+from config import MISTRAL_API_KEY
+# Import utilities for handling previous results
+from ocr_utils import create_results_zip
+def get_base64_from_image(image_path):
+    """Get base64 string from image file"""
+    with open(image_path, "rb") as img_file:
+        return base64.b64encode(img_file.read()).decode('utf-8')
+# Set favicon path
+favicon_path = os.path.join(os.path.dirname(__file__), "static/favicon.png")
+# Set page configuration
+st.set_page_config(
+    page_title="Historical OCR",
+    page_icon=favicon_path if os.path.exists(favicon_path) else "📜",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# Enable caching for expensive operations with longer TTL for better performance
+@st.cache_data(ttl=24*3600, show_spinner=False)  # Cache for 24 hours instead of 1 hour
+def convert_pdf_to_images(pdf_bytes, dpi=150, rotation=0):
+    """Convert PDF bytes to a list of images with caching"""
+    try:
+        images = convert_from_bytes(pdf_bytes, dpi=dpi)
+        # Apply rotation if specified
+        if rotation != 0 and images:
+            rotated_images = []
+            for img in images:
+                rotated_img = img.rotate(rotation, expand=True, resample=Image.BICUBIC)
+                rotated_images.append(rotated_img)
+            return rotated_images
+        return images
+    except Exception as e:
+        st.error(f"Error converting PDF: {str(e)}")
+        return []
+# Cache preprocessed images for better performance
+@st.cache_data(ttl=24*3600, show_spinner=False)  # Cache for 24 hours
+def preprocess_image(image_bytes, preprocessing_options):
+    """Preprocess image with selected options optimized for historical document OCR quality"""
+    # Setup basic console logging
+    import logging
+    logger = logging.getLogger("image_preprocessor")
+    logger.setLevel(logging.INFO)
+    # Log which preprocessing options are being applied
+    logger.info(f"Preprocessing image with options: {preprocessing_options}")
+    # Convert bytes to PIL Image
+    image = Image.open(io.BytesIO(image_bytes))
+    # Check for alpha channel (RGBA) and convert to RGB if needed
+    if image.mode == 'RGBA':
+        # Convert RGBA to RGB by compositing the image onto a white background
+        background = Image.new('RGB', image.size, (255, 255, 255))
+        background.paste(image, mask=image.split()[3])  # 3 is the alpha channel
+        image = background
+        logger.info("Converted RGBA image to RGB")
+    elif image.mode not in ('RGB', 'L'):
+        # Convert other modes to RGB as well
+        image = image.convert('RGB')
+        logger.info(f"Converted {image.mode} image to RGB")
+    # Apply rotation if specified
+    if preprocessing_options.get("rotation", 0) != 0:
+        rotation_degrees = preprocessing_options.get("rotation")
+        image = image.rotate(rotation_degrees, expand=True, resample=Image.BICUBIC)
+    # Resize large images while preserving details important for OCR
+    width, height = image.size
+    max_dimension = max(width, height)
+    # Less aggressive resizing to preserve document details
+    if max_dimension > 2500:
+        scale_factor = 2500 / max_dimension
+        new_width = int(width * scale_factor)
+        new_height = int(height * scale_factor)
+        # Use LANCZOS for better quality preservation
+        image = image.resize((new_width, new_height), Image.LANCZOS)
+    img_array = np.array(image)
+    # Apply preprocessing based on selected options with settings optimized for historical documents
+    document_type = preprocessing_options.get("document_type", "standard")
+    # Process grayscale option first as it's a common foundation
+    if preprocessing_options.get("grayscale", False):
+        if len(img_array.shape) == 3:  # Only convert if it's not already grayscale
+            if document_type == "handwritten":
+                # Enhanced grayscale processing for handwritten documents
+                img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+                # Apply adaptive histogram equalization to enhance handwriting
+                clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
+                img_array = clahe.apply(img_array)
+            else:
+                # Standard grayscale for printed documents
+                img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+            # Convert back to RGB for further processing
+            img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
+    if preprocessing_options.get("contrast", 0) != 0:
+        contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 10)
+        image = Image.fromarray(img_array)
+        enhancer = ImageEnhance.Contrast(image)
+        image = enhancer.enhance(contrast_factor)
+        img_array = np.array(image)
+    if preprocessing_options.get("denoise", False):
+        try:
+            # Apply appropriate denoising based on document type
+            if document_type == "handwritten":
+                # Very light denoising for handwritten documents to preserve pen strokes
+                if len(img_array.shape) == 3 and img_array.shape[2] == 3:  # Color image
+                    img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 3, 3, 5, 9)
+                else:  # Grayscale image
+                    img_array = cv2.fastNlMeansDenoising(img_array, None, 3, 7, 21)
+            else:
+                # Standard denoising for printed documents
+                if len(img_array.shape) == 3 and img_array.shape[2] == 3:  # Color image
+                    img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 5, 5, 7, 21)
+                else:  # Grayscale image
+                    img_array = cv2.fastNlMeansDenoising(img_array, None, 5, 7, 21)
+        except Exception as e:
+            print(f"Denoising error: {str(e)}, falling back to standard processing")
+    # Convert back to PIL Image
+    processed_image = Image.fromarray(img_array)
+    # Higher quality for OCR processing
+    byte_io = io.BytesIO()
+    try:
+        # Make sure the image is in RGB mode before saving as JPEG
+        if processed_image.mode not in ('RGB', 'L'):
+            processed_image = processed_image.convert('RGB')
+        processed_image.save(byte_io, format='JPEG', quality=92, optimize=True)
+        byte_io.seek(0)
+        logger.info(f"Preprocessing complete. Original image mode: {image.mode}, processed mode: {processed_image.mode}")
+        logger.info(f"Original size: {len(image_bytes)/1024:.1f}KB, processed size: {len(byte_io.getvalue())/1024:.1f}KB")
+        return byte_io.getvalue()
+    except Exception as e:
+        logger.error(f"Error saving processed image: {str(e)}")
+        # Fallback to original image
+        logger.info("Using original image as fallback")
+        image_io = io.BytesIO()
+        image.save(image_io, format='JPEG', quality=92)
+        image_io.seek(0)
+        return image_io.getvalue()
+# Cache OCR results in memory to speed up repeated processing
+@st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
+def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key):
+    """Cached version of OCR processing to reuse results"""
+    # Initialize OCR processor
+    processor = StructuredOCR()
+    # Process the file
+    result = processor.process_file(
+        file_path,
+        file_type=file_type,
+        use_vision=use_vision,
+        file_size_mb=file_size_mb
+    )
+    return result
+# Define functions
+def process_file(uploaded_file, use_vision=True, preprocessing_options=None, progress_container=None):
+    """Process the uploaded file and return the OCR results
+    Args:
+        uploaded_file: The uploaded file to process
+        use_vision: Whether to use vision model
+        preprocessing_options: Dictionary of preprocessing options
+        progress_container: Optional container for progress indicators
+    """
+    if preprocessing_options is None:
+        preprocessing_options = {}
+    # Create a container for progress indicators if not provided
+    if progress_container is None:
+        progress_container = st.empty()
+    with progress_container.container():
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+        status_text.markdown('<div class="processing-status-container">Preparing file for processing...</div>', unsafe_allow_html=True)
+    try:
+        # Check if API key is available
+        if not MISTRAL_API_KEY:
+            # Return dummy data if no API key
+            progress_bar.progress(100)
+            status_text.empty()
+            return {
+                "file_name": uploaded_file.name,
+                "topics": ["Document"],
+                "languages": ["English"],
+                "ocr_contents": {
+                    "title": "API Key Required",
+                    "content": "Please set the MISTRAL_API_KEY environment variable to process documents."
+                }
+            }
+        # Update progress - more granular steps
+        progress_bar.progress(10)
+        status_text.markdown('<div class="processing-status-container">Initializing OCR processor...</div>', unsafe_allow_html=True)
+        # Determine file type from extension
+        file_ext = Path(uploaded_file.name).suffix.lower()
+        file_type = "pdf" if file_ext == ".pdf" else "image"
+        file_bytes = uploaded_file.getvalue()
+        # Create a temporary file for processing
+        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp:
+            tmp.write(file_bytes)
+            temp_path = tmp.name
+        # Get PDF rotation value if available and file is a PDF
+        pdf_rotation_value = pdf_rotation if 'pdf_rotation' in locals() and file_type == "pdf" else 0
+        progress_bar.progress(15)
+        # For PDFs, we need to handle differently
+        if file_type == "pdf":
+            status_text.markdown('<div class="processing-status-container">Converting PDF to images...</div>', unsafe_allow_html=True)
+            progress_bar.progress(20)
+            # Convert PDF to images
+            try:
+                # Use the PDF processing pipeline directly from the StructuredOCR class
+                processor = StructuredOCR()
+                # Process the file with direct PDF handling
+                progress_bar.progress(30)
+                status_text.markdown('<div class="processing-status-container">Processing PDF with OCR...</div>', unsafe_allow_html=True)
+                # Get file size in MB for API limits
+                file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
+                # Check if file exceeds API limits (50 MB)
+                if file_size_mb > 50:
+                    os.unlink(temp_path)  # Clean up temp file
+                    progress_bar.progress(100)
+                    status_text.empty()
+                    progress_container.empty()
+                    return {
+                        "file_name": uploaded_file.name,
+                        "topics": ["Document"],
+                        "languages": ["English"],
+                        "error": f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+                        "ocr_contents": {
+                            "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+                            "partial_text": "Document could not be processed due to size limitations."
+                        }
+                    }
+                # Generate cache key
+                import hashlib
+                file_hash = hashlib.md5(file_bytes).hexdigest()
+                cache_key = f"{file_hash}_{file_type}_{use_vision}_{pdf_rotation_value}"
+                # Process with cached function if possible
+                try:
+                    result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key)
+                    progress_bar.progress(90)
+                    status_text.markdown('<div class="processing-status-container">Finalizing results...</div>', unsafe_allow_html=True)
+                except Exception as e:
+                    status_text.markdown(f'<div class="processing-status-container">Processing error: {str(e)}. Retrying...</div>', unsafe_allow_html=True)
+                    progress_bar.progress(60)
+                    # If caching fails, process directly
+                    result = processor.process_file(
+                        temp_path,
+                        file_type=file_type,
+                        use_vision=use_vision,
+                        file_size_mb=file_size_mb,
+                    )
+                    progress_bar.progress(90)
+                    status_text.markdown('<div class="processing-status-container">Finalizing results...</div>', unsafe_allow_html=True)
+            except Exception as e:
+                os.unlink(temp_path)  # Clean up temp file
+                progress_bar.progress(100)
+                status_text.empty()
+                progress_container.empty()
+                raise ValueError(f"Error processing PDF: {str(e)}")
+        else:
+            # For image files, apply preprocessing if needed
+            # Check if any preprocessing options with boolean values are True, or if any non-boolean values are non-default
+            has_preprocessing = (
+                preprocessing_options.get("grayscale", False) or
+                preprocessing_options.get("denoise", False) or
+                preprocessing_options.get("contrast", 0) != 0 or
+                preprocessing_options.get("rotation", 0) != 0 or
+                preprocessing_options.get("document_type", "standard") != "standard"
+            )
+            if has_preprocessing:
+                status_text.markdown('<div class="processing-status-container">Applying image preprocessing...</div>', unsafe_allow_html=True)
+                progress_bar.progress(20)
+                processed_bytes = preprocess_image(file_bytes, preprocessing_options)
+                progress_bar.progress(25)
+                # Save processed image to temp file
+                with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as proc_tmp:
+                    proc_tmp.write(processed_bytes)
+                    # Clean up original temp file and use the processed one
+                    if os.path.exists(temp_path):
+                        os.unlink(temp_path)
+                    temp_path = proc_tmp.name
+                progress_bar.progress(30)
+            else:
+                progress_bar.progress(30)
+            # Get file size in MB for API limits
+            file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
+            # Check if file exceeds API limits (50 MB)
+            if file_size_mb > 50:
+                os.unlink(temp_path)  # Clean up temp file
+                progress_bar.progress(100)
+                status_text.empty()
+                progress_container.empty()
+                return {
+                    "file_name": uploaded_file.name,
+                    "topics": ["Document"],
+                    "languages": ["English"],
+                    "error": f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+                    "ocr_contents": {
+                        "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+                        "partial_text": "Document could not be processed due to size limitations."
+                    }
+                }
+            # Update progress - more granular steps
+            progress_bar.progress(40)
+            status_text.markdown('<div class="processing-status-container">Preparing document for OCR analysis...</div>', unsafe_allow_html=True)
+            # Generate a cache key based on file content, type and settings
+            import hashlib
+            file_hash = hashlib.md5(open(temp_path, 'rb').read()).hexdigest()
+            cache_key = f"{file_hash}_{file_type}_{use_vision}"
+            progress_bar.progress(50)
+            status_text.markdown('<div class="processing-status-container">Processing document with OCR...</div>', unsafe_allow_html=True)
+            # Process the file using cached function if possible
+            try:
+                result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key)
+                progress_bar.progress(80)
+                status_text.markdown('<div class="processing-status-container">Analyzing document structure...</div>', unsafe_allow_html=True)
+                progress_bar.progress(90)
+                status_text.markdown('<div class="processing-status-container">Finalizing results...</div>', unsafe_allow_html=True)
+            except Exception as e:
+                progress_bar.progress(60)
+                status_text.markdown(f'<div class="processing-status-container">Processing error: {str(e)}. Retrying...</div>', unsafe_allow_html=True)
+                # If caching fails, process directly
+                processor = StructuredOCR()
+                result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)
+                progress_bar.progress(90)
+                status_text.markdown('<div class="processing-status-container">Finalizing results...</div>', unsafe_allow_html=True)
+        # Complete progress
+        progress_bar.progress(100)
+        status_text.markdown('<div class="processing-status-container">Processing complete!</div>', unsafe_allow_html=True)
+        time.sleep(0.8)  # Brief pause to show completion
+        status_text.empty()
+        progress_container.empty()  # Remove progress indicators when done
+        # Clean up the temporary file
+        if os.path.exists(temp_path):
+            try:
+                os.unlink(temp_path)
+            except:
+                pass # Ignore errors when cleaning up temporary files
+        return result
+    except Exception as e:
+        progress_bar.progress(100)
+        error_message = str(e)
+        # Check for specific error types and provide helpful user-facing messages
+        if "rate limit" in error_message.lower() or "429" in error_message or "requests rate limit exceeded" in error_message.lower():
+            friendly_message = "The AI service is currently experiencing high demand. Please try again in a few minutes."
+            logger = logging.getLogger("app")
+            logger.error(f"Rate limit error: {error_message}")
+            status_text.markdown(f'<div class="processing-status-container" style="border-left-color: #ff9800;">Rate Limit: {friendly_message}</div>', unsafe_allow_html=True)
+        elif "quota" in error_message.lower() or "credit" in error_message.lower() or "subscription" in error_message.lower():
+            friendly_message = "The API usage quota has been reached. Please check your API key and subscription limits."
+            status_text.markdown(f'<div class="processing-status-container" style="border-left-color: #ef5350;">API Quota: {friendly_message}</div>', unsafe_allow_html=True)
+        else:
+            status_text.markdown(f'<div class="processing-status-container" style="border-left-color: #ef5350;">Error: {error_message}</div>', unsafe_allow_html=True)
+        time.sleep(1.5)  # Show error briefly
+        status_text.empty()
+        progress_container.empty()
+        # Display an appropriate error message based on the exception type
+        if "rate limit" in error_message.lower() or "429" in error_message or "requests rate limit exceeded" in error_message.lower():
+            st.warning(f"API Rate Limit: {friendly_message} This is a temporary issue and does not indicate any problem with your document.")
+        elif "quota" in error_message.lower() or "credit" in error_message.lower() or "subscription" in error_message.lower():
+            st.error(f"API Quota Exceeded: {friendly_message}")
+        else:
+            st.error(f"Error during processing: {error_message}")
+        # Clean up the temporary file
+        try:
+            if 'temp_path' in locals() and os.path.exists(temp_path):
+                os.unlink(temp_path)
+        except:
+            pass  # Ignore errors when cleaning up temporary files
+        raise
+# App title and description
+favicon_base64 = get_base64_from_image(os.path.join(os.path.dirname(__file__), "static/favicon.png"))
+st.markdown(f'<div style="display: flex; align-items: center; gap: 10px;"><img src="data:image/png;base64,{favicon_base64}" width="36" height="36" alt="Scroll Icon"/> <h1 style="margin: 0; padding: 0;">Historical Document OCR</h1></div>', unsafe_allow_html=True)
+st.subheader("Powered by Mistral AI")
+# Check if pytesseract is available for fallback
+try:
+    import pytesseract
+    has_pytesseract = True
+except ImportError:
+    has_pytesseract = False
+# Initialize session state for storing previous results if not already present
+if 'previous_results' not in st.session_state:
+    st.session_state.previous_results = []
+# Create main layout with tabs and columns
+main_tab1, main_tab2, main_tab3 = st.tabs(["Document Processing", "Previous Results", "About"])
+with main_tab1:
+    # Create a two-column layout for file upload and results
+    left_col, right_col = st.columns([1, 1])
+    # File uploader in the left column
+    with left_col:
+        st.markdown("""
+        Upload an image or PDF file to get started.
+        Using the latest `mistral-ocr-latest` model for advanced document understanding.
+        """)
+        uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])
+        # Removed seed prompt instructions from here, moving to sidebar
+# Sidebar with options
+with st.sidebar:
+    st.header("Options")
+    # Model options
+    st.subheader("Model Settings")
+    use_vision = st.checkbox("Use Vision Model", value=True,
+                            help="For image files, use the vision model for improved analysis (may be slower)")
+    # Historical Context section moved up
+    st.subheader("Historical Context")
+    # Historical period selector
+    historical_periods = [
+        "Select period (if known)",
+        "Pre-1700s",
+        "18th Century (1700s)",
+        "19th Century (1800s)",
+        "Early 20th Century (1900-1950)",
+        "Modern (Post 1950)"
+    ]
+    selected_period = st.selectbox(
+        "Historical Period",
+        options=historical_periods,
+        index=0,
+        help="Select the time period of the document for better OCR processing"
+    )
+    # Document purpose selector
+    document_purposes = [
+        "Select purpose (if known)",
+        "Personal Letter/Correspondence",
+        "Official/Government Document",
+        "Business/Financial Record",
+        "Literary/Academic Work",
+        "News/Journalism",
+        "Religious Text",
+        "Legal Document"
+    ]
+    selected_purpose = st.selectbox(
+        "Document Purpose",
+        options=document_purposes,
+        index=0,
+        help="Select the purpose or type of the document for better OCR processing"
+    )
+    # Custom prompt field
+    custom_prompt_text = ""
+    if selected_period != "Select period (if known)":
+        custom_prompt_text += f"This is a {selected_period} document. "
+    if selected_purpose != "Select purpose (if known)":
+        custom_prompt_text += f"It appears to be a {selected_purpose}. "
+    custom_prompt = st.text_area(
+        "Additional Context",
+        value=custom_prompt_text,
+        placeholder="Example: This document has unusual handwriting with cursive script. Please identify any mentioned locations and dates.",
+        height=150,
+        max_chars=500,
+        key="custom_analysis_instructions",
+        help="Powerful instructions field that impacts how the AI processes your document. Can request translations, format images correctly, extract specific information, or handle challenging documents. See the 'Additional Context Instructions & Examples' section below for more details."
+    )
+    # Enhanced instructions for Additional Context with more capabilities
+    with st.expander("Prompting Instructions"):
+        st.markdown("""
+        ### How Additional Context Affects Processing
+        The "Additional Context" field provides instructions directly to the AI to influence how it processes your document. Use it to:
+        #### Document Understanding
+        - **Specify handwriting styles**: "This document uses old-fashioned cursive with numerous flourishes and abbreviations"
+        - **Identify language features**: "The text contains archaic spellings common in 18th century documents"
+        - **Highlight focus areas**: "Look for mentions of financial transactions or dates of travel"
+        #### Output Formatting & Languages
+        - **Request translations**: "After extracting the text, translate the content into Spanish"
+        - **Format image orientation**: "Ensure images are displayed in the same orientation as they appear in the document"
+        - **Format tables**: "Convert any tables in the document to structured format with clear columns"
+        #### Special Processing
+        - **Handle challenges**: "Some portions may be faded; the page edges contain handwritten notes"
+        - **Technical terms**: "This is a medical document with specialized terminology about surgical procedures"
+        - **Organization**: "Separate the letter content from the address blocks and signature"
+        #### Example Combinations
+        ```
+        This is a handwritten letter from the 1850s. The writer uses archaic spellings and formal language.
+        Please preserve paragraph structure, identify any place names mentioned, and note any references
+        to historical events. Format any lists as bullet points.
+        ```
+        """)
+    # Image preprocessing options (collapsible)
+    st.subheader("Image Preprocessing")
+    with st.expander("Preprocessing Options"):
+        preprocessing_options = {}
+        # Document type selector - important for optimized processing
+        doc_type_options = ["standard", "handwritten", "typed", "printed"]
+        preprocessing_options["document_type"] = st.selectbox(
+            "Document Type",
+            options=doc_type_options,
+            index=0,  # Default to standard
+            format_func=lambda x: x.capitalize(),
+            help="Select document type for optimized processing - choose 'Handwritten' for letters and manuscripts"
+        )
+        preprocessing_options["grayscale"] = st.checkbox("Convert to Grayscale",
+                                                        help="Convert image to grayscale before OCR")
+        preprocessing_options["denoise"] = st.checkbox("Denoise Image",
+                                                     help="Remove noise from the image")
+        preprocessing_options["contrast"] = st.slider("Adjust Contrast", -5, 5, 0,
+                                                    help="Adjust image contrast (-5 to +5)")
+        # Add rotation options
+        rotation_options = [0, 90, 180, 270]
+        preprocessing_options["rotation"] = st.select_slider(
+            "Rotate Document",
+            options=rotation_options,
+            value=0,
+            format_func=lambda x: f"{x}° {'(No rotation)' if x == 0 else ''}",
+            help="Rotate the document to correct orientation"
+        )
+    # PDF options (collapsible)
+    st.subheader("PDF Options")
+    with st.expander("PDF Settings"):
+        pdf_dpi = st.slider("PDF Resolution (DPI)", 72, 300, 100,
+                          help="Higher DPI gives better quality but slower processing. Try 100 for faster processing.")
+        max_pages = st.number_input("Maximum Pages to Process", 1, 20, 3,
+                                  help="Limit number of pages to process")
+        # Add PDF rotation option
+        rotation_options = [0, 90, 180, 270]
+        pdf_rotation = st.select_slider(
+            "Rotate PDF",
+            options=rotation_options,
+            value=0,
+            format_func=lambda x: f"{x}° {'(No rotation)' if x == 0 else ''}",
+            help="Rotate the PDF pages to correct orientation"
+        )
+        # Store PDF rotation separately instead of in preprocessing_options
+        # This prevents conflict with image preprocessing
+# Previous Results tab content
+# Renders the history kept in st.session_state.previous_results: either an
+# empty-state panel, or a type filter, a bulk-download button and the filtered list.
+with main_tab2:
+    st.markdown('<h2>Previous Results</h2>', unsafe_allow_html=True)
+    # Load custom CSS for Previous Results tab
+    from ui.layout import load_css
+    load_css()
+    # Display previous results if available
+    if not st.session_state.previous_results:
+        # Empty state: static placeholder shown until a document has been processed
+        st.markdown("""
+        <div class="previous-results-container" style="text-align: center; padding: 40px 20px;">
+            <div style="font-size: 48px; margin-bottom: 20px; color: #757575;">📄</div>
+            <h3 style="color: #212121; margin-bottom: 10px;">No Previous Results</h3>
+            <p style="color: #616161;">Process a document to see your results history saved here.</p>
+        </div>
+        """, unsafe_allow_html=True)
+    else:
+        # Create a container for the results list
+        # (only the opening <div> is emitted here; the closing tag is written by a later st.markdown call)
+        st.markdown('<div class="previous-results-container">', unsafe_allow_html=True)
+        st.markdown(f'<h3>{len(st.session_state.previous_results)} Previous Results</h3>', unsafe_allow_html=True)
+        # Create two columns for filters and download buttons
+        filter_col, download_col = st.columns([2, 1])
+        with filter_col:
+            # Add filter options
+            # A category is offered only when at least one stored file_name has a matching extension
+            filter_options = ["All Types"]
+            if any(result.get("file_name", "").lower().endswith(".pdf") for result in st.session_state.previous_results):
+                filter_options.append("PDF Documents")
+            if any(result.get("file_name", "").lower().endswith((".jpg", ".jpeg", ".png")) for result in st.session_state.previous_results):
+                filter_options.append("Images")
+            selected_filter = st.selectbox("Filter by Type:", filter_options)
+        with download_col:
+            # Add download all button for results
+            # NOTE(review): this check looks redundant inside the non-empty branch; confirm before removing
+            if len(st.session_state.previous_results) > 0:
+                try:
+                    # Create buffer in memory instead of file on disk
+                    # NOTE(review): io is not referenced in this block; left in place in case later code relies on it
+                    import io
+                    from ocr_utils import create_results_zip_in_memory
+                    # Get zip data directly in memory
+                    # The archive is built from the full history (not the filtered view) each time this branch runs
+                    zip_data = create_results_zip_in_memory(st.session_state.previous_results)
+                    st.download_button(
+                        label="Download All Results",
+                        data=zip_data,
+                        file_name="all_ocr_results.zip",
+                        mime="application/zip",
+                        help="Download all previous results as a ZIP file containing HTML and JSON files"
+                    )
+                except Exception as e:
+                    # Any failure while importing or building the archive is reported in the UI instead of aborting the tab
+                    st.error(f"Error creating download: {str(e)}")
+                    st.info("Try with fewer results or individual downloads")
+        # Filter results based on selection
+        # "All Types" (or any other value) keeps the full list
+        filtered_results = st.session_state.previous_results
+        if selected_filter == "PDF Documents":
+            filtered_results = [r for r in st.session_state.previous_results if r.get("file_name", "").lower().endswith(".pdf")]
+        elif selected_filter == "Images":
+            filtered_results = [r for r in st.session_state.previous_results if r.get("file_name", "").lower().endswith((".jpg", ".jpeg", ".png"))]
+        # Show a message if no results match the filter
+        if not filtered_results:
+            st.markdown("""
+            <div style="text-align: center; padding: 20px; background-color: #f9f9f9; border-radius: 5px; margin: 20px 0;">
+                <p>No results match the selected filter.</p>
+            </div>
+            """, unsafe_allow_html=True)
+        # Display each result as a card
+        for i, result in enumerate(filtered_results):
+            # Determine file type icon
+            file_name = result.get("file_name", f"Document {i+1}")
+            file_type_lower = file_name.lower()
+            if file_type_lower.endswith(".pdf"):
+                icon = "📄"
+            elif file_type_lower.endswith((".jpg", ".jpeg", ".png", ".gif")):
+                icon = "🖼️"
+            else:
+                icon = "📝"
+            # Create a card for each result
+            st.markdown(f"""
+            <div class="result-card">
+                <div class="result-header">
+                    <div class="result-filename">{icon} {file_name}</div>
+                    <div class="result-date">{result.get('timestamp', 'Unknown')}</div>
+                </div>
+                <div class="result-metadata">
+                    <div class="result-tag">Languages: {', '.join(result.get('languages', ['Unknown']))}</div>
+                    <div class="result-tag">Topics: {', '.join(result.get('topics', ['Unknown']))}</div>
+                </div>
+            """, unsafe_allow_html=True)
+            # Add view button inside the card with proper styling
+            st.markdown('<div class="result-action-button">', unsafe_allow_html=True)
+            if st.button(f"View Document", key=f"view_{i}"):
+                # Set the selected result in the session state
+                st.session_state.selected_previous_result = st.session_state.previous_results[i]
+                # Force a rerun to show the selected result
+                st.rerun()
+            st.markdown('</div>', unsafe_allow_html=True)
+            # Close the result card
+            st.markdown('</div>', unsafe_allow_html=True)
+        # Close the container
+        st.markdown('</div>', unsafe_allow_html=True)
+        # Display the selected result if available
+        if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
+            selected_result = st.session_state.selected_previous_result
+            # Create a styled container for the selected result
+            st.markdown(f"""
+            <div class="selected-result-container">
+                <div class="result-header" style="margin-bottom: 20px;">
+                    <div class="selected-result-title">Selected Document: {selected_result.get('file_name', 'Unknown')}</div>
+                    <div class="result-date">{selected_result.get('timestamp', '')}</div>
+                </div>
+            """, unsafe_allow_html=True)
+            # Display metadata in a styled way
+            meta_col1, meta_col2 = st.columns(2)
+            with meta_col1:
+                # Display document metadata
+                if 'languages' in selected_result:
+                    languages = [lang for lang in selected_result['languages'] if lang is not None]
+                    if languages:
+                        st.write(f"**Languages:** {', '.join(languages)}")
+                if 'topics' in selected_result and selected_result['topics']:
+                    st.write(f"**Topics:** {', '.join(selected_result['topics'])}")
+            with meta_col2:
+                # Display processing metadata
+                if 'limited_pages' in selected_result:
+                    st.info(f"Processed {selected_result['limited_pages']['processed']} of {selected_result['limited_pages']['total']} pages")
+                if 'processing_time' in selected_result:
+                    proc_time = selected_result['processing_time']
+                    st.write(f"**Processing Time:** {proc_time:.1f}s")
+            # Create tabs for content display
+            has_images = selected_result.get('has_images', False)
+            if has_images:
+                view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw JSON", "With Images"])
+            else:
+                view_tab1, view_tab2 = st.tabs(["Structured View", "Raw JSON"])
+            with view_tab1:
+                # Display structured content
+                if 'ocr_contents' in selected_result and isinstance(selected_result['ocr_contents'], dict):
+                    for section, content in selected_result['ocr_contents'].items():
+                        if content and section not in ['error', 'raw_text', 'partial_text']:  # Skip error and raw text sections
+                            st.markdown(f"#### {section.replace('_', ' ').title()}")
+                            if isinstance(content, str):
+                                st.write(content)
+                            elif isinstance(content, list):
+                                for item in content:
+                                    if isinstance(item, str):
+                                        st.write(f"- {item}")
+                                    else:
+                                        st.write(f"- {str(item)}")
+                            elif isinstance(content, dict):
+                                for k, v in content.items():
+                                    st.write(f"**{k}:** {v}")
+            with view_tab2:
+                # Show the raw JSON with an option to download it
+                st.json(selected_result)
+                # Add JSON download button
+                json_str = json.dumps(selected_result, indent=2)
+                filename = selected_result.get('file_name', 'document').split('.')[0]
+                st.download_button(
+                    label="Download JSON",
+                    data=json_str,
+                    file_name=f"{filename}_data.json",
+                    mime="application/json"
+                )
+            if has_images and 'pages_data' in selected_result:
+                with view_tab3:
+                    # Display content with images in a nicely formatted way
+                    pages_data = selected_result.get('pages_data', [])
+                    # Process and display each page
+                    for page_idx, page in enumerate(pages_data):
+                        # Add a page header if multi-page
+                        if len(pages_data) > 1:
+                            st.markdown(f"### Page {page_idx + 1}")
+                        # Create columns for better layout
+                        if page.get('images'):
+                            # Extract images for this page
+                            images = page.get('images', [])
+                            for img in images:
+                                if 'image_base64' in img:
+                                    st.image(img['image_base64'], width=600)
+                            # Display text content if available
+                            text_content = page.get('markdown', '')
+                            if text_content:
+                                with st.expander("View Page Text", expanded=True):
+                                    st.markdown(text_content)
+                        else:
+                            # Just display text if no images
+                            text_content = page.get('markdown', '')
+                            if text_content:
+                                st.markdown(text_content)
+                        # Add page separator
+                        if page_idx < len(pages_data) - 1:
+                            st.markdown("---")
+                    # Add HTML download button if images are available
+                    from ocr_utils import create_html_with_images
+                    html_content = create_html_with_images(selected_result)
+                    filename = selected_result.get('file_name', 'document').split('.')[0]
+                    st.download_button(
+                        label="Download as HTML with Images",
+                        data=html_content,
+                        file_name=f"{filename}_with_images.html",
+                        mime="text/html"
+                    )
+            # Close the container
+            st.markdown('</div>', unsafe_allow_html=True)
+            # Add clear button outside the container with proper styling
+            col1, col2, col3 = st.columns([1, 1, 1])
+            with col2:
+                st.markdown('<div class="result-action-button" style="text-align: center;">', unsafe_allow_html=True)
+                if st.button("Close Selected Document", key="close_selected"):
+                    # Clear the selected result from session state
+                    del st.session_state.selected_previous_result
+                    # Force a rerun to update the view
+                    st.rerun()
+                st.markdown('</div>', unsafe_allow_html=True)
+# About tab content
+with main_tab3:
+    # Add a notice about local OCR fallback if available
+    fallback_notice = ""
+    if 'has_pytesseract' in locals() and has_pytesseract:
+        fallback_notice = """
+    **Local OCR Fallback:**
+    - Local OCR fallback using Tesseract is available if API rate limits are reached
+    - Provides basic text extraction when cloud OCR is unavailable
+    """
+    st.markdown(f"""
+    ### About This Application
+    This app uses [Mistral AI's Document OCR](https://docs.mistral.ai/capabilities/document/) to extract text and images from historical documents.
+    It can process:
+    - Image files (jpg, png, etc.)
+    - PDF documents (multi-page support)
+    The extracted content is processed into structured data based on the document type, combining:
+    - Text extraction with `mistral-ocr-latest`
+    - Analysis with language models
+    - Layout preservation with images
+    View results in three formats:
+    - Structured HTML view
+    - Raw JSON (for developers)
+    - Markdown with images (preserves document layout)
+    **New Features:**
+    - Image preprocessing for better OCR quality
+    - PDF resolution and page controls
+    - Document rotation (90°, 180°, 270°)
+    - Custom instructions for special document analysis
+    - Performance mode selection (Speed/Balance/Quality)
+    - Progress tracking during processing
+    - Previous Results tab to review processed documents
+    - Enhanced rate limit handling with automatic retry
+    {fallback_notice}
+    """)
+with main_tab1:
+    if uploaded_file is not None:
+        # Check file size (cap at 50MB)
+        file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
+        if file_size_mb > 50:
+            with left_col:
+                st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
+            st.stop()
+        file_ext = Path(uploaded_file.name).suffix.lower()
+        # Process button - flush left with similar padding as file browser
+        with left_col:
+            process_button = st.button("Process Document")
+            # Image preprocessing preview in upload column, right after the process button
+            if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
+                with st.expander("Image Preprocessing Preview"):
+                    preview_cols = st.columns(2)
+                    with preview_cols[0]:
+                        st.markdown("**Original Image**")
+                        st.image(uploaded_file, width=600)
+                    with preview_cols[1]:
+                        st.markdown("**Preprocessed Image**")
+                        try:
+                            processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
+                            st.image(io.BytesIO(processed_bytes), width=600)
+                        except Exception as e:
+                            st.error(f"Error in preprocessing: {str(e)}")
+                            st.info("Try using grayscale preprocessing for PNG images with transparency")
+            # Empty container for progress indicators - will be filled during processing
+            progress_placeholder = st.empty()
+            # Add space (one inch equivalent in Streamlit)
+            st.markdown("<div style='margin-top: 72px;'></div>", unsafe_allow_html=True)
+            # Container for document metadata (will be filled after processing)
+            metadata_placeholder = st.empty()
+        # Results section
+        if process_button:
+            # Move the progress indicator reference to just below the button
+            progress_container = progress_placeholder
+            try:
+                # Get max_pages or default if not available
+                max_pages_value = max_pages if 'max_pages' in locals() else None
+                # Apply performance mode settings
+                if 'perf_mode' in locals():
+                    if perf_mode == "Speed":
+                        # Override settings for faster processing
+                        if 'preprocessing_options' in locals():
+                            preprocessing_options["denoise"] = False  # Skip denoising for speed
+                        if 'pdf_dpi' in locals() and file_ext.lower() == '.pdf':
+                            pdf_dpi = min(pdf_dpi, 100)  # Lower DPI for speed
+                # Process file with or without custom prompt
+                if custom_prompt and custom_prompt.strip():
+                    # Process with custom instructions for the AI
+                    with progress_placeholder.container():
+                        progress_bar = st.progress(0)
+                        status_text = st.empty()
+                        status_text.markdown('<div class="processing-status-container">Processing with custom instructions...</div>', unsafe_allow_html=True)
+                        progress_bar.progress(30)
+                    # Special handling for PDF files with custom prompts
+                    if file_ext.lower() == ".pdf":
+                        # For PDFs with custom prompts, we use a special two-step process
+                        with progress_placeholder.container():
+                            status_text.markdown('<div class="processing-status-container">Using special PDF processing for custom instructions...</div>', unsafe_allow_html=True)
+                            progress_bar.progress(40)
+                            try:
+                                # Step 1: Process without custom prompt to get OCR text
+                                processor = StructuredOCR()
+                                # First save the PDF to a temp file
+                                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+                                    tmp.write(uploaded_file.getvalue())
+                                    temp_path = tmp.name
+                                # Process with NO custom prompt first
+                                base_result = processor.process_file(
+                                    file_path=temp_path,
+                                    file_type="pdf",
+                                    use_vision=use_vision,
+                                    custom_prompt=None,  # No custom prompt in first step
+                                    file_size_mb=len(uploaded_file.getvalue()) / (1024 * 1024)
+                                )
+                                progress_bar.progress(70)
+                                status_text.markdown('<div class="processing-status-container">Applying custom analysis to extracted text...</div>', unsafe_allow_html=True)
+                                # Step 2: Apply custom prompt to the extracted text using text-only LLM
+                                if 'ocr_contents' in base_result and isinstance(base_result['ocr_contents'], dict):
+                                    # Get text from OCR result
+                                    ocr_text = ""
+                                    for section, content in base_result['ocr_contents'].items():
+                                        if isinstance(content, str):
+                                            ocr_text += content + "\n\n"
+                                        elif isinstance(content, list):
+                                            for item in content:
+                                                if isinstance(item, str):
+                                                    ocr_text += item + "\n"
+                                            ocr_text += "\n"
+                                    # Format the custom prompt for text-only processing
+                                    formatted_prompt = f"USER INSTRUCTIONS: {custom_prompt.strip()}\nPay special attention to these instructions and respond accordingly."
+                                    # Apply custom prompt to extracted text
+                                    enhanced_result = processor._extract_structured_data_text_only(ocr_text, uploaded_file.name, formatted_prompt)
+                                    # Merge results, keeping images from base_result
+                                    result = base_result.copy()
+                                    result['custom_prompt_applied'] = 'text_only'
+                                    # Update with enhanced analysis results, preserving image data
+                                    for key, value in enhanced_result.items():
+                                        if key not in ['raw_response_data', 'pages_data', 'has_images']:
+                                            result[key] = value
+                                else:
+                                    # If no OCR content, just use the base result
+                                    result = base_result
+                                    result['custom_prompt_applied'] = 'failed'
+                                # Clean up temp file
+                                if os.path.exists(temp_path):
+                                    os.unlink(temp_path)
+                            except Exception as e:
+                                # If anything fails, revert to standard processing
+                                st.warning(f"Special PDF processing failed. Falling back to standard method: {str(e)}")
+                                result = process_file(uploaded_file, use_vision, {}, progress_container=progress_placeholder)
+                    else:
+                        # For non-PDF files, use normal processing with custom prompt
+                        # Save the uploaded file to a temporary file with preprocessing
+                        with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
+                            # Apply preprocessing if any options are selected
+                            if any(preprocessing_options.values()):
+                                # Apply performance mode settings
+                                if 'perf_mode' in locals() and perf_mode == "Speed":
+                                    # Skip denoising for speed in preprocessing
+                                    speed_preprocessing = preprocessing_options.copy()
+                                    speed_preprocessing["denoise"] = False
+                                    processed_bytes = preprocess_image(uploaded_file.getvalue(), speed_preprocessing)
+                                else:
+                                    processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
+                                tmp.write(processed_bytes)
+                            else:
+                                tmp.write(uploaded_file.getvalue())
+                            temp_path = tmp.name
+                        # Show progress
+                        with progress_placeholder.container():
+                            progress_bar.progress(50)
+                            status_text.markdown('<div class="processing-status-container">Analyzing with custom instructions...</div>', unsafe_allow_html=True)
+                        # Initialize OCR processor and process with custom prompt
+                        processor = StructuredOCR()
+                        # Format the custom prompt to ensure it has an impact
+                        formatted_prompt = f"USER INSTRUCTIONS: {custom_prompt.strip()}\nPay special attention to these instructions and respond accordingly."
+                        try:
+                            result = processor.process_file(
+                                file_path=temp_path,
+                                file_type="image",  # Always use image for non-PDFs
+                                use_vision=use_vision,
+                                custom_prompt=formatted_prompt,
+                                file_size_mb=len(uploaded_file.getvalue()) / (1024 * 1024)
+                            )
+                        except Exception as e:
+                            # For any error, fall back to standard processing
+                            st.warning(f"Custom prompt processing failed. Falling back to standard processing: {str(e)}")
+                            result = process_file(uploaded_file, use_vision, preprocessing_options, progress_container=progress_placeholder)
+                    # Complete progress
+                    with progress_placeholder.container():
+                        progress_bar.progress(100)
+                        status_text.markdown('<div class="processing-status-container">Processing complete!</div>', unsafe_allow_html=True)
+                        time.sleep(0.8)
+                        progress_placeholder.empty()
+                    # Clean up temporary file
+                    if os.path.exists(temp_path):
+                        try:
+                            os.unlink(temp_path)
+                        except:
+                            pass
+                else:
+                    # Standard processing without custom prompt
+                    result = process_file(uploaded_file, use_vision, preprocessing_options, progress_container=progress_placeholder)
+                # Display Document Contents in the right column
+                with right_col:
+                    st.subheader("Document Contents")
+                    # Start document content div with consistent styling class
+                    st.markdown('<div class="document-content">', unsafe_allow_html=True)
+                    if 'ocr_contents' in result:
+                        # Check for has_images in the result
+                        has_images = result.get('has_images', False)
+                        # Create tabs for different views
+                        if has_images:
+                            view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw JSON", "With Images"])
+                        else:
+                            view_tab1, view_tab2 = st.tabs(["Structured View", "Raw JSON"])
+                    with view_tab1:
+                        # Display in a more user-friendly format based on the content structure
+                        html_content = ""
+                        if isinstance(result['ocr_contents'], dict):
+                            for section, content in result['ocr_contents'].items():
+                                if content:  # Only display non-empty sections
+                                    # Add consistent styling for each section
+                                    section_title = f'<h4 style="font-family: Georgia, serif; font-size: 18px; margin-top: 20px; margin-bottom: 10px;">{section.replace("_", " ").title()}</h4>'
+                                    html_content += section_title
+                                    if isinstance(content, str):
+                                        # Optimize by using a expander for very long content
+                                        if len(content) > 1000:
+                                            # Format content for long text - bold everything after "... that"
+                                            preview_content = content[:1000] + "..." if len(content) > 1000 else content
+                                            if "... that" in content:
+                                                # For the preview (first 1000 chars)
+                                                if "... that" in preview_content:
+                                                    parts = preview_content.split("... that", 1)
+                                                    formatted_preview = f"{parts[0]}... that<strong>{parts[1]}</strong>"
+                                                    html_content += f"<p style=\"font-size:16px;\">{formatted_preview}</p>"
+                                                else:
+                                                    html_content += f"<p style=\"font-size:16px; font-weight:normal;\">{preview_content}</p>"
+                                                # For the full content in expander
+                                                parts = content.split("... that", 1)
+                                                formatted_full = f"{parts[0]}... that**{parts[1]}**"
+                                                st.markdown(f"#### {section.replace('_', ' ').title()}")
+                                                with st.expander("Show full content"):
+                                                    st.markdown(formatted_full)
+                                            else:
+                                                html_content += f"<p style=\"font-size:16px; font-weight:normal;\">{preview_content}</p>"
+                                                st.markdown(f"#### {section.replace('_', ' ').title()}")
+                                                with st.expander("Show full content"):
+                                                    st.write(content)
+                                        else:
+                                            # Format content - bold everything after "... that"
+                                            if "... that" in content:
+                                                parts = content.split("... that", 1)
+                                                formatted_content = f"{parts[0]}... that<strong>{parts[1]}</strong>"
+                                                html_content += f"<p style=\"font-size:16px;\">{formatted_content}</p>"
+                                                st.markdown(f"#### {section.replace('_', ' ').title()}")
+                                                st.markdown(f"{parts[0]}... that**{parts[1]}**")
+                                            else:
+                                                html_content += f"<p style=\"font-size:16px; font-weight:normal;\">{content}</p>"
+                                                st.markdown(f"#### {section.replace('_', ' ').title()}")
+                                                st.write(content)
+                                    elif isinstance(content, list):
+                                        html_list = "<ul>"
+                                        st.markdown(f"#### {section.replace('_', ' ').title()}")
+                                        # Limit display for very long lists
+                                        if len(content) > 20:
+                                            with st.expander(f"Show all {len(content)} items"):
+                                                for item in content:
+                                                    if isinstance(item, str):
+                                                        html_list += f"<li>{item}</li>"
+                                                        st.write(f"- {item}")
+                                                    elif isinstance(item, dict):
+                                                        st.json(item)
+                                        else:
+                                            for item in content:
+                                                if isinstance(item, str):
+                                                    html_list += f"<li>{item}</li>"
+                                                    st.write(f"- {item}")
+                                                elif isinstance(item, dict):
+                                                    st.json(item)
+                                        html_list += "</ul>"
+                                        html_content += html_list
+                                    elif isinstance(content, dict):
+                                        html_dict = "<dl>"
+                                        st.markdown(f"#### {section.replace('_', ' ').title()}")
+                                        for k, v in content.items():
+                                            html_dict += f"<dt>{k}</dt><dd>{v}</dd>"
+                                            st.write(f"**{k}:** {v}")
+                                        html_dict += "</dl>"
+                                        html_content += html_dict
+                        # Add download button in a smaller section
+                        with st.expander("Export Content"):
+                            # Get original filename without extension
+                            original_name = Path(result.get('file_name', uploaded_file.name)).stem
+                            # HTML download button
+                            html_bytes = html_content.encode()
+                            st.download_button(
+                                label="Download as HTML",
+                                data=html_bytes,
+                                file_name=f"{original_name}_processed.html",
+                                mime="text/html"
+                            )
+                    with view_tab2:
+                        # Show the raw JSON for developers, with an expander for large results
+                        if len(json.dumps(result)) > 5000:
+                            with st.expander("View full JSON"):
+                                st.json(result)
+                        else:
+                            st.json(result)
+                    if has_images and 'pages_data' in result:
+                        with view_tab3:
+                            # Use pages_data directly instead of raw_response
+                            try:
+                                # Use the serialized pages data
+                                pages_data = result.get('pages_data', [])
+                                if not pages_data:
+                                    st.warning("No image data found in the document.")
+                                    st.stop()
+                                # Construct markdown from pages_data directly
+                                from ocr_utils import replace_images_in_markdown
+                                combined_markdown = ""
+                                for page in pages_data:
+                                    page_markdown = page.get('markdown', '')
+                                    images = page.get('images', [])
+                                    # Create image dictionary
+                                    image_dict = {}
+                                    for img in images:
+                                        if 'id' in img and 'image_base64' in img:
+                                            image_dict[img['id']] = img['image_base64']
+                                    # Replace image references in markdown
+                                    if page_markdown and image_dict:
+                                        page_markdown = replace_images_in_markdown(page_markdown, image_dict)
+                                        combined_markdown += page_markdown + "\n\n---\n\n"
+                                if not combined_markdown:
+                                    st.warning("No content with images found.")
+                                    st.stop()
+                                # Add CSS for better image handling
+                                st.markdown("""
+                                <style>
+                                .image-container {
+                                    margin: 20px 0;
+                                    text-align: center;
+                                }
+                                .markdown-text-container {
+                                    padding: 10px;
+                                    background-color: #f9f9f9;
+                                    border-radius: 5px;
+                                }
+                                .markdown-text-container img {
+                                    margin: 15px auto;
+                                    max-width: 90%;
+                                    max-height: 500px;
+                                    object-fit: contain;
+                                    border: 1px solid #ddd;
+                                    border-radius: 4px;
+                                    display: block;
+                                }
+                                .markdown-text-container p {
+                                    margin-bottom: 16px;
+                                    line-height: 1.6;
+                                    font-family: Georgia, serif;
+                                }
+                                .page-break {
+                                    border-top: 1px solid #ddd;
+                                    margin: 20px 0;
+                                    padding-top: 20px;
+                                }
+                                .page-text-content {
+                                    margin-bottom: 20px;
+                                }
+                                .text-block {
+                                    background-color: #fff;
+                                    padding: 15px;
+                                    border-radius: 4px;
+                                    border-left: 3px solid #546e7a;
+                                    margin-bottom: 15px;
+                                    color: #333;
+                                }
+                                .text-block p {
+                                    margin: 8px 0;
+                                    color: #333;
+                                }
+                                </style>
+                                """, unsafe_allow_html=True)
+                                # Process and display content with images properly
+                                import re
+                                # Process each page separately
+                                pages_content = []
+                                # Check if this is from a PDF processed through pdf2image
+                                is_pdf2image = result.get('pdf_processing_method') == 'pdf2image'
+                                for i, page in enumerate(pages_data):
+                                    page_markdown = page.get('markdown', '')
+                                    images = page.get('images', [])
+                                    if not page_markdown:
+                                        continue
+                                    # Create image dictionary
+                                    image_dict = {}
+                                    for img in images:
+                                        if 'id' in img and 'image_base64' in img:
+                                            image_dict[img['id']] = img['image_base64']
+                                    # Create HTML content for this page
+                                    page_html = f"<h3>Page {i+1}</h3>" if i > 0 else ""
+                                    # Display the raw text content first to ensure it's visible
+                                    page_html += f"<div class='page-text-content'>"
+                                    # Special handling for PDF2image processed documents
+                                    if is_pdf2image and i == 0 and 'ocr_contents' in result:
+                                        # Display all structured content from OCR for PDFs
+                                        page_html += "<div class='text-block pdf-content'>"
+                                        # Check if custom prompt was applied
+                                        if result.get('custom_prompt_applied') == 'text_only':
+                                            page_html += "<div class='prompt-info'><i>Custom analysis applied using text-only processing</i></div>"
+                                        ocr_contents = result.get('ocr_contents', {})
+                                        # Get a sorted list of sections to ensure consistent order
+                                        section_keys = sorted(ocr_contents.keys())
+                                        # Place important sections first
+                                        priority_sections = ['title', 'subtitle', 'header', 'publication', 'date', 'content', 'main_text']
+                                        for important in priority_sections:
+                                            if important in ocr_contents and important in section_keys:
+                                                section_keys.remove(important)
+                                                section_keys.insert(0, important)
+                                        for section in section_keys:
+                                            content = ocr_contents[section]
+                                            if section in ['raw_text', 'error', 'partial_text']:
+                                                continue  # Skip these fields
+                                            section_title = section.replace('_', ' ').title()
+                                            page_html += f"<h4>{section_title}</h4>"
+                                            if isinstance(content, str):
+                                                # Convert newlines to <br> tags
+                                                content_html = content.replace('\n', '<br>')
+                                                page_html += f"<p>{content_html}</p>"
+                                            elif isinstance(content, list):
+                                                page_html += "<ul>"
+                                                for item in content:
+                                                    if isinstance(item, str):
+                                                        page_html += f"<li>{item}</li>"
+                                                    elif isinstance(item, dict):
+                                                        page_html += "<li>"
+                                                        for k, v in item.items():
+                                                            page_html += f"<strong>{k}:</strong> {v}<br>"
+                                                        page_html += "</li>"
+                                                    else:
+                                                        page_html += f"<li>{str(item)}</li>"
+                                                page_html += "</ul>"
+                                            elif isinstance(content, dict):
+                                                for k, v in content.items():
+                                                    if isinstance(v, str):
+                                                        page_html += f"<p><strong>{k}:</strong> {v}</p>"
+                                                    elif isinstance(v, list):
+                                                        page_html += f"<p><strong>{k}:</strong></p><ul>"
+                                                        for item in v:
+                                                            page_html += f"<li>{item}</li>"
+                                                        page_html += "</ul>"
+                                                    else:
+                                                        page_html += f"<p><strong>{k}:</strong> {str(v)}</p>"
+                                        page_html += "</div>"
+                                    else:
+                                        # Standard processing for regular documents
+                                        # Get all text content that isn't an image and add it first
+                                        text_content = []
+                                        for line in page_markdown.split("\n"):
+                                            if not re.search(r'!\[(.*?)\]\((.*?)\)', line) and line.strip():
+                                                text_content.append(line)
+                                        # Add the text content as a block
+                                        if text_content:
+                                            page_html += f"<div class='text-block'>"
+                                            for line in text_content:
+                                                page_html += f"<p>{line}</p>"
+                                            page_html += "</div>"
+                                    page_html += "</div>"
+                                    # Then add images separately
+                                    for line in page_markdown.split("\n"):
+                                        # Handle image lines
+                                        img_match = re.search(r'!\[(.*?)\]\((.*?)\)', line)
+                                        if img_match:
+                                            alt_text = img_match.group(1)
+                                            img_ref = img_match.group(2)
+                                            # Get the base64 data for this image ID
+                                            img_data = image_dict.get(img_ref, "")
+                                            if img_data:
+                                                img_html = f'<div class="image-container"><img src="{img_data}" alt="{alt_text}"></div>'
+                                                page_html += img_html
+                                    # Add page separator if not the last page
+                                    if i < len(pages_data) - 1:
+                                        page_html += '<div class="page-break"></div>'
+                                    pages_content.append(page_html)
+                                # Combine all pages HTML
+                                html_content = "\n".join(pages_content)
+                                # Wrap the content in a div with the class for styling
+                                st.markdown(f"""
+                                <div class="markdown-text-container">
+                                {html_content}
+                                </div>
+                                """, unsafe_allow_html=True)
+                                # Create download HTML content
+                                download_html = f"""
+                                <html>
+                                <head>
+                                    <style>
+                                    body {{
+                                        font-family: Georgia, serif;
+                                        line-height: 1.7;
+                                        margin: 0 auto;
+                                        max-width: 800px;
+                                        padding: 20px;
+                                    }}
+                                    img {{
+                                        max-width: 90%;
+                                        max-height: 500px;
+                                        object-fit: contain;
+                                        margin: 20px auto;
+                                        display: block;
+                                        border: 1px solid #ddd;
+                                        border-radius: 4px;
+                                    }}
+                                    .image-container {{
+                                        margin: 20px 0;
+                                        text-align: center;
+                                    }}
+                                    .page-break {{
+                                        border-top: 1px solid #ddd;
+                                        margin: 40px 0;
+                                        padding-top: 40px;
+                                    }}
+                                    h3 {{
+                                        color: #333;
+                                        border-bottom: 1px solid #eee;
+                                        padding-bottom: 10px;
+                                    }}
+                                    p {{
+                                        margin: 12px 0;
+                                    }}
+                                    .page-text-content {{
+                                        margin-bottom: 20px;
+                                    }}
+                                    .text-block {{
+                                        background-color: #f9f9f9;
+                                        padding: 15px;
+                                        border-radius: 4px;
+                                        border-left: 3px solid #546e7a;
+                                        margin-bottom: 15px;
+                                        color: #333;
+                                    }}
+                                    .text-block p {{
+                                        margin: 8px 0;
+                                        color: #333;
+                                    }}
+                                    </style>
+                                </head>
+                                <body>
+                                <div class="markdown-text-container">
+                                {html_content}
+                                </div>
+                                </body>
+                                </html>
+                                """
+                                # Get original filename without extension
+                                original_name = Path(result.get('file_name', uploaded_file.name)).stem
+                                # Add download button as an expander to prevent page reset
+                                with st.expander("Download Document with Images"):
+                                    st.markdown("Click the button below to download the document with embedded images")
+                                    st.download_button(
+                                        label="Download as HTML",
+                                        data=download_html,
+                                        file_name=f"{original_name}_with_images.html",
+                                        mime="text/html",
+                                        key="download_with_images_button"
+                                    )
+                            except Exception as e:
+                                st.error(f"Could not display document with images: {str(e)}")
+                                st.info("Try refreshing or processing the document again.")
+                    if 'ocr_contents' not in result:
+                        st.error("No OCR content was extracted from the document.")
+                    # Close document content div
+                    st.markdown('</div>', unsafe_allow_html=True)
+                # Add Document Metadata in the left column placeholder
+                with metadata_placeholder.container():
+                    st.subheader("Document Metadata")
+                    st.success("**Document processed successfully**")
+                    # Display file info
+                    st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
+                    # Display info if only limited pages were processed
+                    if 'limited_pages' in result:
+                        st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
+                    # Display languages if available
+                    if 'languages' in result:
+                        languages = [lang for lang in result['languages'] if lang is not None]
+                        if languages:
+                            st.write(f"**Languages:** {', '.join(languages)}")
+                    # Display topics if available
+                    if 'topics' in result and result['topics']:
+                        st.write(f"**Topics:** {', '.join(result['topics'])}")
+                    # Processing time if available
+                    if 'processing_time' in result:
+                        proc_time = result['processing_time']
+                        st.write(f"**Processing Time:** {proc_time:.1f}s")
+                # Store the result in the previous results list
+                # Add timestamp to result for history tracking
+                result_copy = result.copy()
+                result_copy['timestamp'] = datetime.now().strftime("%Y-%m-%d %H:%M")
+                # Add to session state, keeping the most recent 20 results
+                st.session_state.previous_results.insert(0, result_copy)
+                if len(st.session_state.previous_results) > 20:
+                    st.session_state.previous_results = st.session_state.previous_results[:20]
+            except Exception as e:
+                st.error(f"Error processing document: {str(e)}")
+    else:
+        # Display basic info when no file is uploaded
+        st.markdown('<div style="text-align: left; width: auto; display: inline-block;">Upload a document to get started using the file uploader above.</div>', unsafe_allow_html=True)
+        # Show example images in a grid
+        st.subheader("Example Documents")
+        # Add a sample images container
+        with st.container():
+            # Find sample images from the input directory to display
+            input_dir = Path(__file__).parent / "input"
+            sample_images = []
+            backup_dir = Path(__file__).parent / "backup" / "input"
+            if input_dir.exists():
+                # Define images in specific order per requirements
+                ordered_sample_images = []
+                # Define ordered list: magellan, americae, handwritten letter, milgram flier, recipe, magician
+                ordered_image_names = [
+                    "magellan-travels.jpg",
+                    "americae-retectio.jpg",
+                    "handwritten-letter.jpg",
+                    "milgram-flier.png",
+                    "recipe.jpg",
+                    "The Magician, or Bottle Cungerer.jpeg"
+                ]
+                # Create the image list in the desired order
+                for img_name in ordered_image_names:
+                    img_path = input_dir / img_name
+                    if img_path.exists():
+                        ordered_sample_images.append(img_path)
+                # Organize for display: first 3 in top row, next 3 in bottom row
+                sample_images = ordered_sample_images
+                # If we don't have enough samples, fill in with other available images
+                if len(sample_images) < 6:
+                    # Get all remaining images from input directory
+                    all_images = set(
+                        list(input_dir.glob("*.jpg")) +
+                        list(input_dir.glob("*.jpeg")) +
+                        list(input_dir.glob("*.png")) +
+                        list(input_dir.glob("*.tif"))
+                    )
+                    # Remove the already selected images
+                    remaining_images = [img for img in all_images if img not in sample_images]
+                    # Add remaining images to fill the grid
+                    sample_images.extend(remaining_images[:6-len(sample_images)])
+                # If still not enough, try backup directory
+                if len(sample_images) < 6 and backup_dir.exists():
+                    remaining = 6 - len(sample_images)
+                    backup_samples = (
+                        list(backup_dir.glob("*.jpg")) +
+                        list(backup_dir.glob("*.jpeg")) +
+                        list(backup_dir.glob("*.png"))
+                    )[:remaining]
+                    sample_images.extend(backup_samples)
+            if sample_images:
+                # Create two rows of 3 columns each for the 6 examples
+                if len(sample_images) > 3:
+                    # First row
+                    columns1 = st.columns(3)
+                    for i, img_path in enumerate(sample_images[:3]):
+                        with columns1[i]:
+                            if img_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.tif']:
+                                try:
+                                    st.image(str(img_path), caption=img_path.name, width=300)
+                                except Exception:
+                                    st.info(f"Example: {img_path.name}")
+                            else:
+                                # For PDFs, show an icon or info message
+                                st.info(f"PDF Example: {img_path.name}")
+                    # Second row
+                    columns2 = st.columns(3)
+                    for i, img_path in enumerate(sample_images[3:6]):
+                        with columns2[i]:
+                            if img_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.tif']:
+                                try:
+                                    st.image(str(img_path), caption=img_path.name, width=300)
+                                except Exception:
+                                    st.info(f"Example: {img_path.name}")
+                            else:
+                                # For PDFs, show an icon or info message
+                                st.info(f"PDF Example: {img_path.name}")
+                else:
+                    # If we have 3 or fewer samples, just use one row
+                    columns = st.columns(min(3, len(sample_images)))
+                    for i, img_path in enumerate(sample_images):
+                        with columns[i % len(columns)]:
+                            if img_path.suffix.lower() in ['.jpg', '.jpeg', '.png', '.tif']:
+                                try:
+                                    st.image(str(img_path), caption=img_path.name, width=300)
+                                except Exception:
+                                    st.info(f"Example: {img_path.name}")
+                            else:
+                                # For PDFs, show an icon or info message
+                                st.info(f"PDF Example: {img_path.name}")
+            else:
+                st.info("No example documents found. Upload your own document to get started.")

config.py ADDED Viewed

	@@ -0,0 +1,57 @@

+# config.py
+"""
+Configuration file for Mistral OCR processing.
+Contains API key and other settings.
+"""
+import os
+import logging
+from dotenv import load_dotenv
# Configure logging
logger = logging.getLogger("config")

# Load environment variables from .env file if it exists
load_dotenv()

# Mistral API key handling - get from Hugging Face secrets or environment variable
# The priority order is:
# 1. HF_MISTRAL_API_KEY environment var (for Hugging Face deployment)
# 2. MISTRAL_API_KEY environment var (standard environment variable)
# 3. Empty string (will show warning in app)
#
# Each candidate is stripped *before* the fallback is considered, so a variable
# that is defined but empty/whitespace-only (e.g. an unset deployment secret)
# no longer hides a valid key in the next variable.
MISTRAL_API_KEY = (
    os.environ.get("HF_MISTRAL_API_KEY", "").strip()
    or os.environ.get("MISTRAL_API_KEY", "").strip()
)

# Check if we're in test mode (allows operation without valid API key)
TEST_MODE = False  # Disable test mode for production use

# Only the presence of a key is checked here; it is not validated against the API.
if not MISTRAL_API_KEY and not TEST_MODE:
    logger.warning("No Mistral API key found. OCR functionality will not work unless TEST_MODE is enabled.")
if TEST_MODE:
    logger.info("TEST_MODE is enabled. Using mock responses instead of actual API calls.")
# Model settings with fallbacks
OCR_MODEL = os.environ.get("MISTRAL_OCR_MODEL", "mistral-ocr-latest")
TEXT_MODEL = os.environ.get("MISTRAL_TEXT_MODEL", "mistral-small-latest")  # Updated from ministral-8b-latest
VISION_MODEL = os.environ.get("MISTRAL_VISION_MODEL", "mistral-large-latest")  # Updated from pixtral-12b-latest


def _env_flag(name: str, default: str) -> bool:
    """Read environment variable *name* as a boolean switch.

    Args:
        name: Environment variable to read.
        default: Text used when the variable is not set.

    Returns:
        True when the value is "true", "1" or "yes" (case-insensitive,
        surrounding whitespace ignored); False for anything else.
    """
    # int()/float() below already ignore surrounding whitespace; strip here so
    # a value such as "true " is not silently read as False.
    return os.environ.get(name, default).strip().lower() in ("true", "1", "yes")


# Image preprocessing settings optimized for historical documents
# These can be customized from environment variables
IMAGE_PREPROCESSING = {
    "enhance_contrast": float(os.environ.get("ENHANCE_CONTRAST", "1.8")),  # Increased contrast for better text recognition
    "sharpen": _env_flag("SHARPEN", "True"),
    "denoise": _env_flag("DENOISE", "True"),
    "max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "12.0")),    # Increased size limit for better quality
    "target_dpi": int(os.environ.get("TARGET_DPI", "300")),               # Target DPI for scaling
    "compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "95")),  # Higher quality for better OCR results
}

# OCR settings optimized for reliability and performance
OCR_SETTINGS = {
    "timeout_ms": int(os.environ.get("OCR_TIMEOUT_MS", "120000")),        # Extended timeout for larger documents
    "max_retries": int(os.environ.get("OCR_MAX_RETRIES", "3")),           # Increased retry attempts for better reliability
    "retry_delay": int(os.environ.get("OCR_RETRY_DELAY", "2")),           # Longer initial retry delay for better success rate
    "include_image_base64": _env_flag("INCLUDE_IMAGE_BASE64", "True"),
    "thread_count": int(os.environ.get("OCR_THREAD_COUNT", "4")),         # Thread count for parallel processing
}

input/The Magician, or Bottle Cungerer.jpeg ADDED Viewed

Git LFS Details

SHA256: 3becaf6f5548a794436864885bb125f3fa09f1e6f7bdd76e8878f2d36ff26232
Pointer size: 132 Bytes
Size of remote file: 2.96 MB

input/americae-retectio.jpg ADDED Viewed

Git LFS Details

SHA256: 3ea42f6d3f7c0331a08321c26978c9011843965de99735a178de8167fdede544
Pointer size: 131 Bytes
Size of remote file: 452 kB

input/handwritten-letter.jpg ADDED Viewed

Git LFS Details

SHA256: 7fe2d81bb4e8bef7cdbf87c58a8cc180c49c313e5099de167ae37bbbfb895e88
Pointer size: 131 Bytes
Size of remote file: 231 kB

input/harpers.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c9030714b07bb5f7c9adf8b175975baa9b4f40402da62d69cad9b0d4ba61b94
+size 14931299

input/magellan-travels.jpg ADDED Viewed

Git LFS Details

SHA256: ae3e860789e2c3c8032499e5326864294dbc1b01059169fd08203c980577010b
Pointer size: 131 Bytes
Size of remote file: 283 kB

input/milgram-flier.png ADDED Viewed

Git LFS Details

SHA256: 0e1ca2821304427dcf7e2c9e0a03de880f44146bf8fa6abc9a437249fda85486
Pointer size: 130 Bytes
Size of remote file: 88.5 kB

input/recipe.jpg ADDED Viewed

Git LFS Details

SHA256: 8bdb2a05dee10e4e181d8636714915f3055c664297e512f805fea180446624b2
Pointer size: 130 Bytes
Size of remote file: 70.8 kB

ocr_utils.py ADDED Viewed

	@@ -0,0 +1,1255 @@

+"""
+Utility functions for OCR processing with Mistral AI.
+Contains helper functions for working with OCR responses and image handling.
+"""
+import json
+import base64
+import io
+import zipfile
+import logging
+import numpy as np
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Union, Any, Tuple
+from functools import lru_cache
+# Configure logging
+logger = logging.getLogger("ocr_utils")
# Optional imaging backends. Each library is probed in its own try block so
# that BOTH availability flags are always defined, whichever import fails.
# (With a single shared try block, a failed PIL import left CV2_AVAILABLE
# unassigned, and a failed cv2 import left PILLOW_AVAILABLE unassigned.)
try:
    from PIL import Image, ImageEnhance, ImageFilter, ImageOps
    PILLOW_AVAILABLE = True
except ImportError:
    PILLOW_AVAILABLE = False

try:
    import cv2
    CV2_AVAILABLE = True
except ImportError:
    CV2_AVAILABLE = False
+from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
# Import configuration
try:
    from config import IMAGE_PREPROCESSING
except ImportError:
    # Fallback defaults if config not available.
    # NOTE(review): values mirror the built-in defaults in config.py
    # (contrast 1.8, 12 MB limit, quality 95) so preprocessing behaves the
    # same whether or not the config module can be imported; the previous
    # fallback (1.5 / 8.0 / 92) had drifted from them. Confirm this is intended.
    IMAGE_PREPROCESSING = {
        "enhance_contrast": 1.8,
        "sharpen": True,
        "denoise": True,
        "max_size_mb": 12.0,
        "target_dpi": 300,
        "compression_quality": 95,
    }
+def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
+    """
+    Swap every ``![id](id)`` placeholder for an inline base64 image reference.
+    Args:
+        markdown_str: Markdown text containing image placeholders
+        images_dict: Dictionary mapping image IDs to base64 strings
+    Returns:
+        The markdown with each known placeholder pointing at its base64 data
+    """
+    rendered = markdown_str
+    for image_id, encoded in images_dict.items():
+        placeholder = f"![{image_id}]({image_id})"
+        embedded = f"![{image_id}]({encoded})"
+        rendered = rendered.replace(placeholder, embedded)
+    return rendered
+def get_combined_markdown(ocr_response) -> str:
+    """
+    Build one markdown document from every page of an OCR response.
+    Each page's image placeholders are replaced with the base64 payloads the
+    page carries, then the pages are joined with blank lines.
+    Args:
+        ocr_response: OCR response object exposing a ``pages`` iterable
+    Returns:
+        Markdown of all pages, separated by double newlines
+    """
+    def render_page(page) -> str:
+        # Only images exposing both an id and a payload can be embedded
+        embedded = {
+            image.id: image.image_base64
+            for image in getattr(page, "images", ())
+            if hasattr(image, "id") and hasattr(image, "image_base64")
+        }
+        return replace_images_in_markdown(getattr(page, "markdown", ""), embedded)
+    return "\n\n".join([render_page(page) for page in ocr_response.pages])
+def encode_image_for_api(image_path: Union[str, Path]) -> str:
+    """
+    Encode an image as base64 data URL for API submission.
+    The media type in the URL is derived from the file extension so that PNG,
+    GIF, TIFF, etc. are labelled correctly; unknown or non-image extensions
+    keep the previous ``image/jpeg`` label.
+    Args:
+        image_path: Path to the image file
+    Returns:
+        Base64 data URL for the image
+    Raises:
+        FileNotFoundError: If image_path does not point to an existing file
+    """
+    import mimetypes  # local import: only this function needs it
+    image_file = Path(image_path)
+    # Verify image exists
+    if not image_file.is_file():
+        raise FileNotFoundError(f"Image file not found: {image_file}")
+    # Pick the media type from the extension, defaulting to JPEG
+    mime_type, _ = mimetypes.guess_type(image_file.name)
+    if not mime_type or not mime_type.startswith("image/"):
+        mime_type = "image/jpeg"
+    encoded = base64.b64encode(image_file.read_bytes()).decode()
+    return f"data:{mime_type};base64,{encoded}"
+def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
+    """
+    Run OCR on a single image file and hand back the raw response.
+    Args:
+        client: Mistral AI client
+        image_path: Path to the image file
+        model: OCR model to use
+    Returns:
+        OCR response object
+    """
+    # The image travels to the API inline, as a base64 data URL
+    data_url = encode_image_for_api(image_path)
+    run_ocr = client.ocr.process
+    return run_ocr(document=ImageURLChunk(image_url=data_url), model=model)
+def ocr_response_to_json(ocr_response, indent: int = 4) -> str:
+    """
+    Serialize an OCR response into a formatted JSON string.
+    Attributes missing from the response, its pages, or their images are
+    written as empty strings (or an empty list for ``pages``/``images``).
+    Args:
+        ocr_response: OCR response object
+        indent: Indentation level for JSON formatting
+    Returns:
+        Formatted JSON string
+    """
+    def image_entry(image) -> dict:
+        return {
+            "id": getattr(image, "id", ""),
+            "base64": getattr(image, "image_base64", ""),
+        }
+    def page_entry(page) -> dict:
+        return {
+            "text": getattr(page, "text", ""),
+            "markdown": getattr(page, "markdown", ""),
+            "images": [image_entry(image) for image in getattr(page, "images", ())],
+        }
+    payload = {
+        "text": getattr(ocr_response, "text", ""),
+        "pages": [page_entry(page) for page in getattr(ocr_response, "pages", ())],
+    }
+    return json.dumps(payload, indent=indent)
+def _write_result_entries(zipf, result, suffix="", image_prefix="", max_images_per_page=None):
+    """
+    Write the archive entries for one OCR result into an open zip file.
+    Args:
+        zipf: Open ``zipfile.ZipFile`` in write mode
+        result: OCR result dictionary
+        suffix: Text appended to the fixed entry names (e.g. ``"_1"`` in batches)
+        image_prefix: Text placed before ``page_N_img_M`` in image entry names
+        max_images_per_page: Cap on images written per page; None means no cap
+    """
+    # JSON dump of the whole result
+    zipf.writestr(f"results{suffix}.json", json.dumps(result, indent=2))
+    # HTML content (generated from the result)
+    html_content = create_html_with_images(result)
+    filename = result.get('file_name', f'document{suffix}').split('.')[0]
+    zipf.writestr(f"{filename}_with_images.html", html_content)
+    # Raw OCR text if available
+    if "ocr_contents" in result and "raw_text" in result["ocr_contents"]:
+        zipf.writestr(f"ocr_text{suffix}.txt", result["ocr_contents"]["raw_text"])
+    # HTML visualization if available
+    if "html_visualization" in result:
+        zipf.writestr(f"visualization{suffix}.html", result["html_visualization"])
+    # Extracted images if available
+    for page_number, page in enumerate(result.get("pages_data", []), start=1):
+        page_images = page.get("images", [])[:max_images_per_page]
+        for img_number, img in enumerate(page_images, start=1):
+            img_base64 = img.get("image_base64", "")
+            if not img_base64:
+                continue
+            # Strip data URL prefix if present
+            if img_base64.startswith("data:image"):
+                img_base64 = img_base64.split(",", 1)[1]
+            try:
+                img_data = base64.b64decode(img_base64)
+            except (ValueError, TypeError) as exc:
+                # One undecodable image must not cost the rest of the archive
+                logger.warning("Skipping undecodable image %d on page %d: %s", img_number, page_number, exc)
+                continue
+            zipf.writestr(f"images/{image_prefix}page_{page_number}_img_{img_number}.jpg", img_data)
+def create_results_zip_in_memory(results):
+    """
+    Create a zip file containing OCR results in memory.
+    A single result produces ``results.json``, ``<name>_with_images.html``,
+    ``ocr_text.txt``, ``visualization.html`` and ``images/page_N_img_M.jpg``.
+    A list of results produces the same entries numbered per result
+    (``results_1.json``, ``images/result_1_page_N_img_M.jpg``, ...) and keeps
+    at most three images per page to conserve memory.
+    Archiving is best effort: a result that fails part-way keeps the entries
+    already written and the failure is logged instead of raised.
+    Args:
+        results: Dictionary or list of OCR results
+    Returns:
+        Binary zip file data
+    """
+    zip_buffer = io.BytesIO()
+    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
+        if isinstance(results, list):
+            for number, result in enumerate(results, start=1):
+                try:
+                    _write_result_entries(
+                        zipf,
+                        result,
+                        suffix=f"_{number}",
+                        image_prefix=f"result_{number}_",
+                        max_images_per_page=3,
+                    )
+                except Exception as exc:
+                    # If any result fails, skip the rest of it and continue
+                    logger.warning("Could not fully archive result %d: %s", number, exc)
+        else:
+            try:
+                _write_result_entries(zipf, results)
+            except Exception as exc:
+                # Keep whatever was written before the failure
+                logger.warning("Could not fully archive result: %s", exc)
+    return zip_buffer.getvalue()
+def create_results_zip(results, output_dir=None, zip_name=None):
+    """
+    Create a zip file containing OCR results.
+    If building the archive fails, a placeholder zip holding only ``info.txt``
+    is written instead and the failure is logged.
+    Args:
+        results: Dictionary or list of OCR results
+        output_dir: Optional output directory (defaults to ``./output``);
+            created, including missing parents, when it does not exist
+        zip_name: Optional zip file name; by default derived from the result's
+            ``file_name``/``processed_at`` or, for lists, from the current time
+    Returns:
+        Path to the created zip file
+    """
+    output_dir = Path.cwd() / "output" if output_dir is None else Path(output_dir)
+    # parents=True so a nested output directory does not raise FileNotFoundError
+    output_dir.mkdir(parents=True, exist_ok=True)
+    if zip_name is None:
+        now_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+        if isinstance(results, list):
+            # For list of results, use timestamp and generic name
+            zip_name = f"ocr-results_{now_stamp}.zip"
+        else:
+            # For single result, use original file's info
+            if "processed_at" in results:
+                # str() guards against non-string timestamps; ':' and '.' are
+                # replaced because they are awkward in file names
+                timestamp = str(results["processed_at"]).replace(":", "-").replace(".", "-")
+            else:
+                timestamp = now_stamp
+            # Keep only the final path component so the name stays inside output_dir
+            file_name = Path(str(results.get("file_name", "ocr-results"))).name
+            zip_name = f"{file_name}_{timestamp}.zip"
+    zip_path = output_dir / zip_name
+    try:
+        # Build the archive in memory first, then persist it in one write
+        zip_path.write_bytes(create_results_zip_in_memory(results))
+    except Exception as exc:
+        logger.warning("Could not create complete archive %s: %s", zip_path, exc)
+        # Create a placeholder zip file as fallback
+        with zipfile.ZipFile(zip_path, 'w') as zipf:
+            zipf.writestr("info.txt", "Could not create complete archive")
+    return zip_path
+# Advanced image preprocessing functions
+def _remember_preprocessed(cache_key, result, max_entries=20, evict_count=5):
+    """
+    Store a preprocessing result in ``preprocess_image_for_ocr._cache``.
+    When the cache holds more than max_entries items, the evict_count oldest
+    entries (dict insertion order) are dropped first. Nothing is stored when
+    cache_key is None.
+    """
+    if cache_key is None:
+        return
+    if not hasattr(preprocess_image_for_ocr, "_cache"):
+        preprocess_image_for_ocr._cache = {}
+    cache = preprocess_image_for_ocr._cache
+    if len(cache) > max_entries:
+        # Remove several entries at once to avoid evicting on every call
+        for stale_key in list(cache)[:evict_count]:
+            del cache[stale_key]
+    cache[cache_key] = result
+def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image, str]:
+    """
+    Preprocess an image for OCR and encode it as a JPEG base64 data URL.
+    Files under 100 KB are only mode-normalized and re-encoded. Larger files
+    are optionally downscaled, classified as document or photo, enhanced
+    accordingly, and then encoded. Results are cached on the function object,
+    keyed by file name, size and modification time.
+    Args:
+        image_path: Path to the image file
+    Returns:
+        Tuple of (processed PIL Image, base64 data URL). The image is None
+        when PIL is unavailable or preprocessing fails; in that case the data
+        URL is the unmodified file as produced by encode_image_for_api.
+    """
+    # Local import: `time` is not part of this module's import block, and
+    # without it the timing call below raised NameError on every run.
+    import time
+    # Fast path: Skip all processing if PIL not available
+    if not PILLOW_AVAILABLE:
+        logger.info("PIL not available, skipping image preprocessing")
+        return None, encode_image_for_api(image_path)
+    image_file = Path(image_path)
+    cache_key = None  # stays None when the file cannot be stat'ed
+    file_size_mb = 0  # default if we can't determine size
+    try:
+        # One stat call supplies everything needed for the cache key
+        file_stat = image_file.stat()
+        file_size = file_stat.st_size
+        file_size_mb = file_size / (1024 * 1024)
+        cache_key = f"{image_file.name}_{file_size}_{file_stat.st_mtime}"
+        # Fast path: Return cached result if available
+        cached = getattr(preprocess_image_for_ocr, "_cache", {}).get(cache_key)
+        if cached is not None:
+            logger.debug(f"Using cached preprocessing result for {image_file.name}")
+            return cached
+        # Small images (less than 100KB) likely don't need preprocessing
+        if file_size < 100000:  # 100KB
+            logger.info(f"Image {image_file.name} is small ({file_size/1024:.1f}KB), using minimal processing")
+            with Image.open(image_file) as opened:
+                # convert()/copy() both give an image that stays usable after
+                # the with-block closes the file-backed original
+                if opened.mode not in ('RGB', 'L'):
+                    small = opened.convert('RGB')
+                else:
+                    small = opened.copy()
+            # Save with light optimization
+            buffer = io.BytesIO()
+            small.save(buffer, format="JPEG", quality=95, optimize=True)
+            encoded_image = base64.b64encode(buffer.getvalue()).decode()
+            result = (small, f"data:image/jpeg;base64,{encoded_image}")
+            _remember_preprocessed(cache_key, result)
+            return result
+    except Exception as e:
+        # If stat, cache handling or the small-image path fails, log and
+        # continue with full processing
+        logger.debug(f"Cache handling failed for {image_path}: {str(e)}")
+    try:
+        # Process start time for performance logging
+        start_time = time.time()
+        with Image.open(image_file) as opened:
+            # Normalize image mode
+            img = opened if opened.mode in ('RGB', 'L') else opened.convert('RGB')
+            width, height = img.size
+            image_area = width * height
+            # Detect document type only for medium to large images to save processing time
+            is_document = False
+            if image_area > 500000:  # Approx 700x700 or larger
+                # The detector reads its input from this function attribute
+                _detect_document_type_impl._current_img = img
+                is_document = _detect_document_type_impl(None)
+                logger.debug(f"Document type detection for {image_file.name}: {'document' if is_document else 'photo'}")
+            # Resize large images for API efficiency
+            if file_size_mb > IMAGE_PREPROCESSING["max_size_mb"] or max(width, height) > 3000:
+                max_dimension = max(width, height)
+                # Use a sliding scale for reduction based on image size
+                if max_dimension > 5000:
+                    scale_factor = 0.25  # Aggressive reduction for very large images
+                elif max_dimension > 3000:
+                    scale_factor = 0.4   # Significant reduction for large images
+                else:
+                    scale_factor = 0.6   # Moderate reduction for medium images
+                new_width = int(width * scale_factor)
+                new_height = int(height * scale_factor)
+                # Faster filter for very large inputs, higher quality otherwise
+                resample = Image.BILINEAR if image_area > 3000000 else Image.LANCZOS
+                processed_img = img.resize((new_width, new_height), resample)
+                logger.debug(f"Resized image from {width}x{height} to {new_width}x{new_height}")
+            else:
+                # Skip resizing for smaller images
+                processed_img = img
+            # Apply appropriate processing based on document type and size
+            # (thresholds use the pre-resize pixel count)
+            if is_document:
+                if image_area > 1000000:  # Full processing for larger documents
+                    processed = preprocess_document_image(processed_img)
+                else:
+                    # Just enhance contrast for small documents to save time
+                    processed = ImageEnhance.Contrast(processed_img).enhance(1.3)
+            else:
+                if image_area > 1000000:  # Full processing for larger photos
+                    processed = preprocess_general_image(processed_img)
+                else:  # Skip processing for smaller photos
+                    processed = processed_img
+            if processed is opened:
+                # No step produced a new image; detach from the file handle
+                # before the with-block closes it
+                processed = opened.copy()
+            # Lower JPEG quality for large files to keep the API payload small
+            if file_size_mb > 5:
+                quality = 85
+            else:
+                quality = IMAGE_PREPROCESSING["compression_quality"]
+            buffer = io.BytesIO()
+            processed.save(buffer, format="JPEG", quality=quality, optimize=True)
+            encoded_image = base64.b64encode(buffer.getvalue()).decode()
+            result = (processed, f"data:image/jpeg;base64,{encoded_image}")
+        _remember_preprocessed(cache_key, result)
+        # Log performance metrics
+        processing_time = time.time() - start_time
+        logger.debug(f"Image preprocessing completed in {processing_time:.3f}s for {image_file.name}")
+        return result
+    except Exception as e:
+        # If preprocessing fails, log error and use original image
+        logger.warning(f"Image preprocessing failed: {str(e)}. Using original image.")
+        return None, encode_image_for_api(image_path)
+# Removed caching decorator to fix unhashable type error
+def detect_document_type(img: Image.Image) -> bool:
+    """
+    Detect if an image is likely a document (text-heavy) vs. a photo.
+    Args:
+        img: PIL Image object
+    Returns:
+        True if likely a document, False otherwise
+    """
+    # The implementation reads its input from a function attribute, so the
+    # image must be stored there first; otherwise the argument is ignored and
+    # a stale (or missing) image is classified instead.
+    _detect_document_type_impl._current_img = img
+    return _detect_document_type_impl(None)
+def _detect_document_type_impl(img_hash=None) -> bool:
+    """
+    Classify the image stored in ``_detect_document_type_impl._current_img``.
+    The caller must assign the PIL image to that function attribute before
+    calling. The attribute is a plain function attribute shared by all
+    callers (it is not thread-local).
+    Args:
+        img_hash: Unused; kept for backward compatibility
+    Returns:
+        True if the image looks like a text document; False for photos, for
+        images under 100,000 pixels, or when no image has been set
+    """
+    img = getattr(_detect_document_type_impl, "_current_img", None)
+    if img is None:
+        return False  # Fail safe in case image is not set
+    # Skip processing for tiny images - just classify as non-documents
+    width, height = img.size
+    if width * height < 100000:  # Approx 300x300 or smaller
+        return False
+    # NOTE: a colour-sampling shortcut used to sit here. It sampled roughly a
+    # 10x10 grid of pixels but required more than 1000 unique colours, so it
+    # could never trigger and has been dropped without changing any result.
+    # Convert to grayscale for analysis
+    gray_img = img.convert('L')
+    # PIL-only path for systems without OpenCV
+    if not CV2_AVAILABLE:
+        # Downscale so the longer side matches min(width, height, 1000)
+        sample_size = min(width, height, 1000)
+        scale_factor = sample_size / max(width, height)
+        if scale_factor < 0.9:  # Only resize if significant reduction
+            # max(1, ...) keeps extreme aspect ratios from requesting a
+            # zero-sized image, which resize() rejects
+            sample_img = gray_img.resize(
+                (max(1, int(width * scale_factor)), max(1, int(height * scale_factor))),
+                Image.NEAREST  # Fastest resampling method
+            )
+        else:
+            sample_img = gray_img
+        # Share of pixels whose edge response exceeds the threshold
+        edges = sample_img.filter(ImageFilter.FIND_EDGES)
+        edge_data = edges.getdata()
+        edge_threshold = 50
+        edge_count = sum(1 for p in edge_data if p > edge_threshold)
+        edge_ratio = edge_count / len(edge_data)
+        # Share of bright pixels - simple approximation of a paper background
+        bright_count = sum(1 for p in gray_img.getdata() if p > 200)
+        bright_ratio = bright_count / (width * height)
+        # Documents typically have more edges (text boundaries) and bright areas (background)
+        return edge_ratio > 0.05 or bright_ratio > 0.4
+    # OpenCV path
+    img_np = np.array(gray_img)
+    # 1. High standard deviation of pixel values suggests a document
+    #    (dark text on a light background)
+    if np.std(img_np) > 60:
+        return True
+    # 2. Edge density, measured on a copy limited to 1000 px on the long side
+    if max(img_np.shape) > 1000:
+        scale = 1000 / max(img_np.shape)
+        small_img = cv2.resize(img_np, None, fx=scale, fy=scale, interpolation=cv2.INTER_NEAREST)
+    else:
+        small_img = img_np
+    edges = cv2.Canny(small_img, 50, 150, L2gradient=False)
+    edge_ratio = np.count_nonzero(edges) / edges.size
+    # 3. Shares of very dark (< 50) and very light (> 200) pixels
+    dark_ratio = np.count_nonzero(img_np < 50) / img_np.size
+    light_ratio = np.count_nonzero(img_np > 200) / img_np.size
+    # Documents typically have both dark (text) and light (background) regions,
+    # and/or well-defined edges. bool() turns the numpy result into the plain
+    # bool the signature promises.
+    return bool((dark_ratio > 0.05 and light_ratio > 0.3) or edge_ratio > 0.04)
+# Removed caching to fix unhashable type error
+def preprocess_document_image(img: Image.Image) -> Image.Image:
+    """
+    Run the document-oriented OCR preprocessing on an image.
+    Args:
+        img: PIL Image object
+    Returns:
+        Processed PIL Image
+    """
+    # The implementation picks its input up from this function attribute
+    setattr(preprocess_document_image, "_current_img", img)
+    return _preprocess_document_image_impl()
+def _preprocess_document_image_impl() -> Image.Image:
+    """
+    Preprocess the image stored in ``preprocess_document_image._current_img``.
+    Processing grows with the pixel count:
+      - under 300,000 px: grayscale + contrast boost
+      - under 1,000,000 px: additionally sharpened (if enabled in config)
+      - larger: additionally denoised (if enabled); above 1,500,000 px with
+        OpenCV available this uses non-local-means denoising and, below
+        8,000,000 px, adaptive binarization when it keeps at least 80% of the
+        bright pixels; otherwise a 3x3 median filter is applied
+    Returns:
+        Processed grayscale (or binarized) PIL Image
+    Raises:
+        ValueError: If no image has been stored for processing
+    """
+    if not hasattr(preprocess_document_image, "_current_img"):
+        raise ValueError("No image set for document preprocessing")
+    img = preprocess_document_image._current_img
+    width, height = img.size
+    img_size = width * height
+    # Every path starts with grayscale conversion plus a contrast boost
+    gray = img.convert('L')
+    enhanced = ImageEnhance.Contrast(gray).enhance(IMAGE_PREPROCESSING["enhance_contrast"])
+    # Tiny images: nothing more to do
+    if img_size < 300000:  # ~500x600 or smaller
+        return enhanced
+    # Light sharpening only if sharpen is enabled
+    if IMAGE_PREPROCESSING["sharpen"]:
+        enhanced = enhanced.filter(ImageFilter.SHARPEN)
+    # Small images: stop after sharpening
+    if img_size < 1000000:  # ~1000x1000 or smaller
+        return enhanced
+    if img_size > 1500000 and CV2_AVAILABLE and IMAGE_PREPROCESSING["denoise"]:
+        try:
+            # Convert to numpy array for OpenCV processing
+            img_np = np.array(enhanced)
+            if img_size > 4000000:  # Very large images (~2000x2000 or larger)
+                # Denoise a half-size copy, then scale back up, to bound the cost
+                downsample = cv2.resize(img_np, None, fx=0.5, fy=0.5,
+                                      interpolation=cv2.INTER_AREA)
+                # Arguments: strength 7, template window 5, search window 13
+                denoised_np = cv2.fastNlMeansDenoising(downsample, None, 7, 5, 13)
+                # Resize back to original size
+                denoised_np = cv2.resize(denoised_np, (width, height), interpolation=cv2.INTER_LINEAR)
+            else:
+                # Direct denoising: strength 8, template window 5, search window 15
+                denoised_np = cv2.fastNlMeansDenoising(img_np, None, 8, 5, 15)
+            # From here on the denoised image is the result, even if the
+            # binarization attempt below fails
+            enhanced = Image.fromarray(denoised_np)
+            if img_size < 8000000:  # Skip for extremely large images to save processing time
+                binary = cv2.adaptiveThreshold(denoised_np, 255,
+                                             cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                             cv2.THRESH_BINARY, 11, 2)
+                # Accept the binarized image only if it keeps most of the
+                # bright (background) pixels
+                white_pixels_binary = np.count_nonzero(binary > 200)
+                white_pixels_orig = np.count_nonzero(denoised_np > 200)
+                if white_pixels_binary > white_pixels_orig * 0.8:
+                    return Image.fromarray(binary)
+        except Exception as e:
+            # If OpenCV processing fails, continue with the best image so far
+            logger.debug(f"OpenCV document preprocessing failed: {str(e)}")
+    elif IMAGE_PREPROCESSING["denoise"]:
+        # Light median filter when OpenCV is unavailable or the image is
+        # at most 1,500,000 px
+        enhanced = enhanced.filter(ImageFilter.MedianFilter(3))
+    # Return enhanced grayscale image
+    return enhanced
+# Removed caching to fix unhashable type error
+def preprocess_general_image(img: Image.Image) -> Image.Image:
+    """
+    Run the general (non-document) OCR preprocessing on an image.
+    Args:
+        img: PIL Image object
+    Returns:
+        Processed PIL Image
+    """
+    # The implementation picks its input up from this function attribute
+    setattr(preprocess_general_image, "_current_img", img)
+    return _preprocess_general_image_impl()
+def _preprocess_general_image_impl() -> Image.Image:
+    """
+    Optimized implementation of general image preprocessing with size-based processing paths
+    """
+    # Fast path: Get the image from thread-local storage
+    if not hasattr(preprocess_general_image, "_current_img"):
+        raise ValueError("No image set for general preprocessing")
+    img = preprocess_general_image._current_img
+    # Ultra-fast path: Skip processing completely for small images to improve performance
+    width, height = img.size
+    img_size = width * height
+    if img_size < 300000:  # Skip for tiny images under ~0.3 megapixel
+        # Just ensure correct color mode
+        if img.mode != 'RGB':
+            return img.convert('RGB')
+        return img
+    # Fast path: Minimal processing for smaller images
+    if img_size < 600000:  # ~800x750 or smaller
+        # Ensure RGB mode
+        if img.mode != 'RGB':
+            img = img.convert('RGB')
+        # Very light contrast enhancement only
+        enhancer = ImageEnhance.Contrast(img)
+        return enhancer.enhance(1.15)  # Lighter enhancement for small images
+    # Standard path: Apply moderate enhancements for medium images
+    # Convert to RGB to ensure compatibility
+    if img.mode != 'RGB':
+        img = img.convert('RGB')
+    # Moderate enhancement only
+    enhancer = ImageEnhance.Contrast(img)
+    enhanced = enhancer.enhance(1.2)  # Less aggressive than document enhancement
+    # Skip additional processing for medium-sized images
+    if img_size < 1000000:  # Skip for images under ~1 megapixel
+        return enhanced
+    # Enhanced path: Additional processing for larger images
+    try:
+        # Apply optimized enhancement pipeline for large non-document images
+        # 1. Improve color saturation slightly for better feature extraction
+        saturation = ImageEnhance.Color(enhanced)
+        enhanced = saturation.enhance(1.1)
+        # 2. Apply adaptive sharpening based on image size
+        if img_size > 2500000:  # Very large images (~1600x1600 or larger)
+            # Use EDGE_ENHANCE instead of SHARPEN for more subtle enhancement on large images
+            enhanced = enhanced.filter(ImageFilter.EDGE_ENHANCE)
+        else:
+            # Standard sharpening for regular large images
+            enhanced = enhanced.filter(ImageFilter.SHARPEN)
+        # 3. Apply additional processing with OpenCV if available (for largest images)
+        if CV2_AVAILABLE and img_size > 3000000:
+            # Convert to numpy array
+            img_np = np.array(enhanced)
+            # Apply subtle enhancement of details (CLAHE)
+            try:
+                # Convert to LAB color space for better processing
+                lab = cv2.cvtColor(img_np, cv2.COLOR_RGB2LAB)
+                # Only enhance the L channel (luminance)
+                l, a, b = cv2.split(lab)
+                # Create CLAHE object with optimal parameters for photos
+                clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+                # Apply CLAHE to L channel
+                l = clahe.apply(l)
+                # Merge channels back and convert to RGB
+                lab = cv2.merge((l, a, b))
+                enhanced_np = cv2.cvtColor(lab, cv2.COLOR_LAB2RGB)
+                # Convert back to PIL
+                enhanced = Image.fromarray(enhanced_np)
+            except:
+                # If CLAHE fails, continue with PIL-enhanced image
+                pass
+    except Exception:
+        # If any enhancement fails, fall back to basic contrast enhancement
+        if img.mode != 'RGB':
+            img = img.convert('RGB')
+        enhancer = ImageEnhance.Contrast(img)
+        enhanced = enhancer.enhance(1.2)
+    return enhanced
+# Removed caching decorator to fix unhashable type error
+def resize_image(img: Image.Image, target_dpi: int = 300) -> Image.Image:
+    """
+    Resize an image to an optimal size for OCR while preserving quality.
+    Args:
+        img: PIL Image object
+        target_dpi: Target DPI (dots per inch)
+    Returns:
+        Resized PIL Image
+    """
+    # Store the image for implementation function
+    resize_image._current_img = img
+    return resize_image_impl(target_dpi)
+def resize_image_impl(target_dpi: int = 300) -> Image.Image:
+    """
+    Implementation of resize function that uses thread-local storage.
+    Args:
+        target_dpi: Target DPI (dots per inch)
+    Returns:
+        Resized PIL Image
+    """
+    # Get the image from thread-local storage (set by the caller)
+    if not hasattr(resize_image, "_current_img"):
+        raise ValueError("No image set for resizing")
+    img = resize_image._current_img
+    # Calculate current dimensions
+    width, height = img.size
+    # Fixed target dimensions based on DPI
+    # Using 8.5x11 inches (standard paper size) as reference
+    max_width = int(8.5 * target_dpi)
+    max_height = int(11 * target_dpi)
+    # Check if resizing is needed - quick early return
+    if width <= max_width and height <= max_height:
+        return img  # No resizing needed
+    # Calculate scaling factor once
+    scale_factor = min(max_width / width, max_height / height)
+    # Calculate new dimensions
+    new_width = int(width * scale_factor)
+    new_height = int(height * scale_factor)
+    # Use BICUBIC for better balance of speed and quality
+    return img.resize((new_width, new_height), Image.BICUBIC)
+def calculate_image_entropy(img: Image.Image) -> float:
+    """
+    Calculate the entropy (information content) of an image.
+    Args:
+        img: PIL Image object
+    Returns:
+        Entropy value
+    """
+    # Convert to grayscale
+    if img.mode != 'L':
+        img = img.convert('L')
+    # Calculate histogram
+    histogram = img.histogram()
+    total_pixels = img.width * img.height
+    # Calculate entropy
+    entropy = 0
+    for h in histogram:
+        if h > 0:
+            probability = h / total_pixels
+            entropy -= probability * np.log2(probability)
+    return entropy
+def create_html_with_images(result):
+    """
+    Create an HTML document with embedded images from OCR results.
+    Args:
+        result: OCR result dictionary containing pages_data
+    Returns:
+        HTML content as string
+    """
+    # Create HTML document structure
+    html_content = """
+    <!DOCTYPE html>
+    <html>
+    <head>
+        <meta charset="UTF-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1.0">
+        <title>OCR Document with Images</title>
+        <style>
+            body {
+                font-family: Georgia, serif;
+                line-height: 1.7;
+                margin: 0 auto;
+                max-width: 800px;
+                padding: 20px;
+            }
+            img {
+                max-width: 90%;
+                max-height: 500px;
+                object-fit: contain;
+                margin: 20px auto;
+                display: block;
+                border: 1px solid #ddd;
+                border-radius: 4px;
+            }
+            .image-container {
+                margin: 20px 0;
+                text-align: center;
+            }
+            .page-break {
+                border-top: 1px solid #ddd;
+                margin: 40px 0;
+                padding-top: 40px;
+            }
+            h3 {
+                color: #333;
+                border-bottom: 1px solid #eee;
+                padding-bottom: 10px;
+            }
+            p {
+                margin: 12px 0;
+            }
+            .page-text-content {
+                margin-bottom: 20px;
+            }
+            .text-block {
+                background-color: #f9f9f9;
+                padding: 15px;
+                border-radius: 4px;
+                border-left: 3px solid #546e7a;
+                margin-bottom: 15px;
+                color: #333;
+            }
+            .text-block p {
+                margin: 8px 0;
+                color: #333;
+            }
+            .metadata {
+                background-color: #f5f5f5;
+                padding: 10px 15px;
+                border-radius: 4px;
+                margin-bottom: 20px;
+                font-size: 14px;
+            }
+            .metadata p {
+                margin: 5px 0;
+            }
+        </style>
+    </head>
+    <body>
+    """
+    # Add document metadata
+    html_content += f"""
+    <div class="metadata">
+        <h2>{result.get('file_name', 'Document')}</h2>
+        <p><strong>Processed at:</strong> {result.get('timestamp', '')}</p>
+        <p><strong>Languages:</strong> {', '.join(result.get('languages', ['Unknown']))}</p>
+        <p><strong>Topics:</strong> {', '.join(result.get('topics', ['Unknown']))}</p>
+    </div>
+    """
+    # Check if we have pages_data
+    if 'pages_data' in result and result['pages_data']:
+        pages_data = result['pages_data']
+        # Process each page
+        for i, page in enumerate(pages_data):
+            page_markdown = page.get('markdown', '')
+            images = page.get('images', [])
+            # Add page header if multi-page
+            if len(pages_data) > 1:
+                html_content += f"<h3>Page {i+1}</h3>"
+            # Create image dictionary
+            image_dict = {}
+            for img in images:
+                if 'id' in img and 'image_base64' in img:
+                    image_dict[img['id']] = img['image_base64']
+            # Process the markdown content
+            if page_markdown:
+                # Extract text content (lines without images)
+                text_content = []
+                image_lines = []
+                for line in page_markdown.split('\n'):
+                    if '![' in line and '](' in line:
+                        image_lines.append(line)
+                    elif line.strip():
+                        text_content.append(line)
+                # Add text content
+                if text_content:
+                    html_content += '<div class="text-block">'
+                    for line in text_content:
+                        html_content += f"<p>{line}</p>"
+                    html_content += '</div>'
+                # Add images
+                for line in image_lines:
+                    # Extract image ID and alt text using simple parsing
+                    try:
+                        alt_start = line.find('![') + 2
+                        alt_end = line.find(']', alt_start)
+                        alt_text = line[alt_start:alt_end]
+                        img_start = line.find('(', alt_end) + 1
+                        img_end = line.find(')', img_start)
+                        img_id = line[img_start:img_end]
+                        if img_id in image_dict:
+                            html_content += f'<div class="image-container">'
+                            html_content += f'<img src="{image_dict[img_id]}" alt="{alt_text}">'
+                            html_content += f'</div>'
+                    except:
+                        # If parsing fails, just skip this image
+                        continue
+            # Add page separator if not the last page
+            if i < len(pages_data) - 1:
+                html_content += '<div class="page-break"></div>'
+    # Add structured content if available
+    if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict):
+        html_content += '<h3>Structured Content</h3>'
+        for section, content in result['ocr_contents'].items():
+            if content and section not in ['error', 'raw_text', 'partial_text']:
+                html_content += f'<h4>{section.replace("_", " ").title()}</h4>'
+                if isinstance(content, str):
+                    html_content += f'<p>{content}</p>'
+                elif isinstance(content, list):
+                    html_content += '<ul>'
+                    for item in content:
+                        html_content += f'<li>{str(item)}</li>'
+                    html_content += '</ul>'
+                elif isinstance(content, dict):
+                    html_content += '<dl>'
+                    for k, v in content.items():
+                        html_content += f'<dt>{k}</dt><dd>{v}</dd>'
+                    html_content += '</dl>'
+    # Close HTML document
+    html_content += """
+    </body>
+    </html>
+    """
+    return html_content
+def generate_document_thumbnail(image_path: Union[str, Path], max_size: int = 300) -> str:
+    """
+    Generate a thumbnail for document preview.
+    Args:
+        image_path: Path to the image file
+        max_size: Maximum dimension for thumbnail
+    Returns:
+        Base64 encoded thumbnail
+    """
+    if not PILLOW_AVAILABLE:
+        return None
+    try:
+        # Open the image
+        with Image.open(image_path) as img:
+            # Calculate thumbnail size preserving aspect ratio
+            width, height = img.size
+            if width > height:
+                new_width = max_size
+                new_height = int(height * (max_size / width))
+            else:
+                new_height = max_size
+                new_width = int(width * (max_size / height))
+            # Create thumbnail
+            thumbnail = img.resize((new_width, new_height), Image.LANCZOS)
+            # Save to buffer
+            buffer = io.BytesIO()
+            thumbnail.save(buffer, format="JPEG", quality=85)
+            buffer.seek(0)
+            # Encode as base64
+            encoded = base64.b64encode(buffer.getvalue()).decode()
+            return f"data:image/jpeg;base64,{encoded}"
+    except Exception:
+        # Return None if thumbnail generation fails
+        return None
+def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str = None) -> str:
+    """
+    Attempt to use local pytesseract OCR as a fallback when API fails
+    Args:
+        image_path: Path to the image file
+        base64_data_url: Optional base64 data URL if already available
+    Returns:
+        OCR text string if successful, None if failed
+    """
+    logger.info("Attempting local OCR fallback using pytesseract...")
+    try:
+        import pytesseract
+        from PIL import Image
+        # Load image - either from path or from base64
+        if base64_data_url and base64_data_url.startswith('data:image'):
+            # Extract image from base64
+            image_data = base64_data_url.split(',', 1)[1]
+            image_bytes = base64.b64decode(image_data)
+            image = Image.open(io.BytesIO(image_bytes))
+        else:
+            # Load from file path
+            image_path = Path(image_path) if isinstance(image_path, str) else image_path
+            image = Image.open(image_path)
+        # Convert to RGB if not already (pytesseract works best with RGB)
+        if image.mode != 'RGB':
+            image = image.convert('RGB')
+        # Apply image enhancements for better OCR
+        # Convert to grayscale for better text recognition
+        image = image.convert('L')
+        # Enhance contrast
+        enhancer = ImageEnhance.Contrast(image)
+        image = enhancer.enhance(2.0)  # Higher contrast for better OCR
+        # Run OCR
+        ocr_text = pytesseract.image_to_string(image, lang='eng')
+        if ocr_text and len(ocr_text.strip()) > 50:
+            logger.info(f"Local OCR successful: extracted {len(ocr_text)} characters")
+            return ocr_text
+        else:
+            logger.warning("Local OCR produced minimal or no text")
+            return None
+    except ImportError:
+        logger.warning("Pytesseract not installed - local OCR not available")
+        return None
+    except Exception as e:
+        logger.error(f"Local OCR fallback failed: {str(e)}")
+        return None

packages.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ poppler-utils
2	+ tesseract-ocr

pdf_ocr.py ADDED Viewed

	@@ -0,0 +1,76 @@

+#!/usr/bin/env python3
+"""
+PDFOCR - Module for processing PDF files with OCR and extracting structured data.
+"""
+import json
+from pathlib import Path
+from structured_ocr import StructuredOCR
+class PDFOCR:
+    """Class for processing PDF files with OCR and extracting structured data."""
+    def __init__(self, api_key=None):
+        """Initialize the PDF OCR processor."""
+        self.processor = StructuredOCR(api_key=api_key)
+    def process_pdf(self, pdf_path, use_vision=True):
+        """
+        Process a PDF file with OCR and extract structured data.
+        Args:
+            pdf_path: Path to the PDF file
+            use_vision: Whether to use vision model for improved analysis
+        Returns:
+            Dictionary with structured OCR results
+        """
+        pdf_path = Path(pdf_path)
+        if not pdf_path.exists():
+            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
+        return self.processor.process_file(pdf_path, file_type="pdf", use_vision=use_vision)
+    def save_json_output(self, pdf_path, output_path, use_vision=True):
+        """
+        Process a PDF file and save the structured output as JSON.
+        Args:
+            pdf_path: Path to the PDF file
+            output_path: Path where to save the JSON output
+            use_vision: Whether to use vision model for improved analysis
+        Returns:
+            Path to the saved JSON file
+        """
+        # Process the PDF
+        result = self.process_pdf(pdf_path, use_vision=use_vision)
+        # Save the result to JSON
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(output_path, 'w') as f:
+            json.dump(result, f, indent=2)
+        return output_path
+# For testing directly
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) < 2:
+        print("Usage: python pdf_ocr.py <pdf_path> [output_path]")
+        sys.exit(1)
+    pdf_path = sys.argv[1]
+    output_path = sys.argv[2] if len(sys.argv) > 2 else None
+    processor = PDFOCR()
+    if output_path:
+        result_path = processor.save_json_output(pdf_path, output_path)
+        print(f"Results saved to: {result_path}")
+    else:
+        result = processor.process_pdf(pdf_path)
+        print(json.dumps(result, indent=2))

process_file.py ADDED Viewed

	@@ -0,0 +1,68 @@

+"""
+Utility function for processing files with OCR in the Historical OCR Workshop app.
+"""
+import os
+import tempfile
+from pathlib import Path
+from datetime import datetime
+def process_file(uploaded_file, use_vision=True, processor=None, custom_prompt=None):
+    """Process the uploaded file and return the OCR results
+    Args:
+        uploaded_file: The uploaded file to process
+        use_vision: Whether to use vision model
+        processor: StructuredOCR processor (if None, it will be imported)
+        custom_prompt: Optional additional instructions for the model
+    Returns:
+        dict: The OCR results
+    """
+    # Import the processor if not provided
+    if processor is None:
+        from structured_ocr import StructuredOCR
+        processor = StructuredOCR()
+    # Save the uploaded file to a temporary file
+    with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
+        tmp.write(uploaded_file.getvalue())
+        temp_path = tmp.name
+    try:
+        # Determine file type from extension
+        file_ext = Path(uploaded_file.name).suffix.lower()
+        file_type = "pdf" if file_ext == ".pdf" else "image"
+        # Get file size in MB
+        file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
+        # Process the file with file size information for automatic page limiting
+        result = processor.process_file(
+            temp_path,
+            file_type=file_type,
+            use_vision=use_vision,
+            file_size_mb=file_size_mb,
+            custom_prompt=custom_prompt
+        )
+        # Add processing metadata
+        result.update({
+            "file_name": uploaded_file.name,
+            "processed_at": datetime.now().isoformat(),
+            "file_size_mb": round(file_size_mb, 2),
+            "use_vision": use_vision
+        })
+        # No longer needed - removing confidence score
+        return result
+    except Exception as e:
+        return {
+            "error": str(e),
+            "file_name": uploaded_file.name
+        }
+    finally:
+        # Clean up the temporary file
+        if os.path.exists(temp_path):
+            os.unlink(temp_path)

requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+# Generated requirements for Hugging Face Spaces deployment
+streamlit>=1.28.0
+mistralai>=0.0.3
+Pillow>=9.0.0
+opencv-python-headless>=4.5.0
+pdf2image>=1.16.0
+python-dotenv>=0.19.0
+pycountry>=22.1.10
+pydantic>=1.9.0
+numpy>=1.20.0
+requests>=2.28.0
+# Additional packages from original requirements
+pillow>=10.0.0
+python-multipart>=0.0.6
+pytesseract>=0.3.10

static/favicon.ico ADDED Viewed

static/favicon.png ADDED Viewed

Git LFS Details

SHA256: 579585886ddea743aa3e212e698632f315c6130d5d6dd3287a015011dbb8fc3a
Pointer size: 128 Bytes
Size of remote file: 779 Bytes

static/scroll.svg ADDED Viewed

structured_ocr.py ADDED Viewed

	@@ -0,0 +1,1718 @@

+import os
+import sys
+import time
+import random
+from enum import Enum
+from pathlib import Path
+import json
+import base64
+import pycountry
+import logging
+from functools import lru_cache
+from typing import Optional, Dict, Any, List, Union, Tuple
+from pydantic import BaseModel
+from mistralai import Mistral
+from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
+from mistralai.models import OCRImageObject
+# Configure logging
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+# Import utilities for OCR processing
+try:
+    from ocr_utils import replace_images_in_markdown, get_combined_markdown
+except ImportError:
+    # Define fallback functions if module not found
+    def replace_images_in_markdown(markdown_str, images_dict):
+        for img_name, base64_str in images_dict.items():
+            markdown_str = markdown_str.replace(
+                f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
+            )
+        return markdown_str
+    def get_combined_markdown(ocr_response):
+        markdowns = []
+        for page in ocr_response.pages:
+            image_data = {}
+            for img in page.images:
+                image_data[img.id] = img.image_base64
+            markdowns.append(replace_images_in_markdown(page.markdown, image_data))
+        return "\n\n".join(markdowns)
+# Import config directly (now local to historical-ocr)
+from config import MISTRAL_API_KEY, OCR_MODEL, TEXT_MODEL, VISION_MODEL, TEST_MODE
+# Helper function to make OCR objects JSON serializable
+# Removed caching to fix unhashable type error
+def serialize_ocr_response(obj):
+    """
+    Convert OCR response objects to JSON serializable format
+    Optimized for speed and memory usage
+    """
+    # Fast path: Handle primitive types directly
+    if obj is None or isinstance(obj, (str, int, float, bool)):
+        return obj
+    # Handle collections with optimized recursion
+    if isinstance(obj, list):
+        return [serialize_ocr_response(item) for item in obj]
+    elif isinstance(obj, dict):
+        return {k: serialize_ocr_response(v) for k, v in obj.items()}
+    elif hasattr(obj, '__dict__'):
+        # For OCR objects with __dict__ attribute
+        result = {}
+        for key, value in obj.__dict__.items():
+            if key.startswith('_'):
+                continue  # Skip private attributes
+            # Fast path for OCRImageObject - most common complex object
+            if isinstance(value, OCRImageObject):
+                # Special handling for OCRImageObject with direct attribute access
+                result[key] = {
+                    'id': value.id if hasattr(value, 'id') else None,
+                    'image_base64': value.image_base64 if hasattr(value, 'image_base64') else None
+                }
+            # Handle collections
+            elif isinstance(value, list):
+                result[key] = [serialize_ocr_response(item) for item in value]
+            # Handle nested objects
+            elif hasattr(value, '__dict__'):
+                result[key] = serialize_ocr_response(value)
+            # Handle primitives and other types
+            else:
+                result[key] = value
+        return result
+    else:
+        return obj
+# Create language enum for structured output - cache language lookup to avoid repeated processing
+@lru_cache(maxsize=1)
+def get_language_dict():
+    return {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
+class LanguageMeta(Enum.__class__):
+    """Enum metaclass that adds a member for every entry of get_language_dict() at class creation."""
+    def __new__(metacls, cls, bases, classdict):
+        # Inject the members into the class namespace before the Enum class is
+        # built: the member name is the upper-cased language name with spaces
+        # replaced by underscores, the member value is the language name.
+        languages = get_language_dict()
+        for code, name in languages.items():
+            classdict[name.upper().replace(' ', '_')] = name
+        return super().__new__(metacls, cls, bases, classdict)
+class Language(Enum, metaclass=LanguageMeta):
+    # Members are not declared here; LanguageMeta generates them from
+    # get_language_dict() when this class is created.
+    pass
+class StructuredOCRModel(BaseModel):
+    """Pydantic schema describing a structured OCR result."""
+    # Name of the processed file
+    file_name: str
+    # Topic labels for the document
+    topics: list[str]
+    # Languages of the document, as members of the generated Language enum
+    languages: list[Language]
+    # Extracted content; free-form dict (presumably keyed by section name - confirm against producers)
+    ocr_contents: dict
+class StructuredOCR:
+    def __init__(self, api_key=None):
+        """Initialize the OCR processor with API key"""
+        # Check if we're running in test mode
+        self.test_mode = TEST_MODE
+        # Initialize API key - use provided key, or environment var
+        if self.test_mode and not api_key:
+            self.api_key = "placeholder_key"
+        else:
+            self.api_key = api_key or MISTRAL_API_KEY
+        # Ensure we have a valid API key when not in test mode
+        if not self.api_key and not self.test_mode:
+            raise ValueError("No Mistral API key provided. Please set the MISTRAL_API_KEY environment variable or enable TEST_MODE.")
+        # Clean the API key by removing any whitespace
+        self.api_key = self.api_key.strip()
+        # Check if API key exists but don't enforce length requirements
+        if not self.test_mode and not self.api_key:
+            logger = logging.getLogger("api_validator")
+            logger.warning("Warning: No API key provided")
+        # Initialize client with the API key
+        try:
+            self.client = Mistral(api_key=self.api_key)
+            # Skip validation to avoid unnecessary API calls
+        except Exception as e:
+            error_msg = str(e).lower()
+            if "unauthorized" in error_msg or "401" in error_msg:
+                raise ValueError(f"API key authentication failed. Please check your Mistral API key: {str(e)}")
+            else:
+                raise
+    def process_file(self, file_path, file_type=None, use_vision=True, max_pages=None, file_size_mb=None, custom_pages=None, custom_prompt=None):
+        """Process a file and return structured OCR results
+        Args:
+            file_path: Path to the file to process
+            file_type: 'pdf' or 'image' (will be auto-detected if None)
+            use_vision: Whether to use vision model for improved analysis
+            max_pages: Optional limit on number of pages to process
+            file_size_mb: Optional file size in MB (used for automatic page limiting)
+            custom_pages: Optional list of specific page numbers to process
+            custom_prompt: Optional instructions for the AI to handle unusual document formatting or specific extraction needs
+        Returns:
+            Dictionary with structured OCR results
+        """
+        # Convert file_path to Path object if it's a string
+        file_path = Path(file_path)
+        # Auto-detect file type if not provided
+        if file_type is None:
+            suffix = file_path.suffix.lower()
+            file_type = "pdf" if suffix == ".pdf" else "image"
+        # Get file size if not provided
+        if file_size_mb is None and file_path.exists():
+            file_size_mb = file_path.stat().st_size / (1024 * 1024)  # Convert bytes to MB
+        # Check if file exceeds API limits (50 MB)
+        if file_size_mb and file_size_mb > 50:
+            logging.warning(f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB")
+            return {
+                "file_name": file_path.name,
+                "topics": ["Document"],
+                "languages": ["English"],
+                "confidence_score": 0.0,
+                "error": f"File size {file_size_mb:.2f} MB exceeds API limit of 50 MB",
+                "ocr_contents": {
+                    "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+                    "partial_text": "Document could not be processed due to size limitations."
+                }
+            }
+        # For PDF files, limit pages based on file size if no explicit limit is given
+        if file_type == "pdf" and file_size_mb and max_pages is None and custom_pages is None:
+            if file_size_mb > 100:  # Very large files
+                max_pages = 3
+            elif file_size_mb > 50:  # Large files
+                max_pages = 5
+            elif file_size_mb > 20:  # Medium files
+                max_pages = 10
+            else:  # Small files
+                max_pages = None  # Process all pages
+        # Start processing timer
+        start_time = time.time()
+        # Read and process the file
+        if file_type == "pdf":
+            result = self._process_pdf(file_path, use_vision, max_pages, custom_pages, custom_prompt)
+        else:
+            result = self._process_image(file_path, use_vision, custom_prompt)
+        # Add processing time information
+        processing_time = time.time() - start_time
+        result['processing_time'] = processing_time
+        # Add a default confidence score if not present
+        if 'confidence_score' not in result:
+            result['confidence_score'] = 0.85  # Default confidence
+        # Ensure the entire result is fully JSON serializable by running it through our serializer
+        try:
+            # First convert to a standard dict if it's not already
+            if not isinstance(result, dict):
+                result = serialize_ocr_response(result)
+            # Make a final pass to check for any remaining non-serializable objects
+            # Test JSON serialization to catch any remaining issues
+            json.dumps(result)
+        except TypeError as e:
+            # If there's a serialization error, run the whole result through our serializer
+            logger = logging.getLogger("serializer")
+            logger.warning(f"JSON serialization error in result: {str(e)}. Applying full serialization.")
+            result = serialize_ocr_response(result)
+        return result
+    def _process_pdf(self, file_path, use_vision=True, max_pages=None, custom_pages=None, custom_prompt=None):
+        """
+        Process a PDF file with OCR - optimized version with smart page handling and memory management
+        Args:
+            file_path: Path to the PDF file
+            use_vision: Whether to use vision model for enhanced analysis
+            max_pages: Optional limit on the number of pages to process
+            custom_pages: Optional list of specific page numbers to process
+            custom_prompt: Optional custom prompt for specialized extraction
+        """
+        logger = logging.getLogger("pdf_processor")
+        logger.info(f"Processing PDF: {file_path}")
+        # Track processing time
+        start_time = time.time()
+        # Fast path: Return placeholder if in test mode
+        if self.test_mode:
+            logger.info("Test mode active, returning placeholder response")
+            # Enhanced test mode placeholder that's more realistic
+            return {
+                "file_name": file_path.name,
+                "topics": ["Historical Document", "Literature", "American History"],
+                "languages": ["English"],
+                "ocr_contents": {
+                    "title": "Harper's New Monthly Magazine",
+                    "publication_date": "1855",
+                    "publisher": "Harper & Brothers, New York",
+                    "raw_text": "This is a test mode placeholder for Harper's New Monthly Magazine from 1855. The actual document contains articles on literature, politics, science, and culture from mid-19th century America.",
+                    "content": "The magazine includes various literary pieces, poetry, political commentary, and illustrations typical of 19th century periodicals. Known for publishing works by prominent American authors including Herman Melville and Charles Dickens.",
+                    "key_figures": ["Herman Melville", "Charles Dickens", "Henry Wadsworth Longfellow"],
+                    "noted_articles": ["Continued serialization of popular novels", "Commentary on contemporary political events", "Scientific discoveries and technological advancements"]
+                },
+                "pdf_processing_method": "enhanced_test_mode",
+                "total_pages": 12,
+                "processed_pages": 3,
+                "processing_time": 0.5,
+                "confidence_score": 0.9
+            }
+        try:
+            # PDF processing strategy decision based on file size
+            file_size_mb = file_path.stat().st_size / (1024 * 1024)
+            logger.info(f"PDF size: {file_size_mb:.2f} MB")
+            # Always use pdf2image for better control and consistency across all PDF files
+            use_pdf2image = True
+            # First try local PDF processing for better performance and control
+            if use_pdf2image:
+                try:
+                    import tempfile
+                    from pdf2image import convert_from_path
+                    logger.info("Processing PDF using pdf2image for better multi-page handling")
+                    # Convert PDF to images with optimized parameters
+                    conversion_start = time.time()
+                    # Use consistent DPI for all files to ensure reliable results
+                    dpi = 200  # Higher quality DPI for all files to ensure better text recognition
+                    # Only convert first page initially to check document type
+                    pdf_first_page = convert_from_path(file_path, dpi=dpi, first_page=1, last_page=1)
+                    logger.info(f"First page converted in {time.time() - conversion_start:.2f}s")
+                    # Quick check if PDF has readable content
+                    if not pdf_first_page:
+                        logger.warning("PDF conversion produced no images, falling back to API")
+                        raise Exception("PDF conversion failed to produce images")
+                    # Determine total pages in the document
+                    # First, try simple estimate from first page conversion
+                    total_pages = 1
+                    # Try pdf2image info extraction
+                    try:
+                        # Try with pdf2image page counting - use simpler parameters
+                        logger.info("Determining PDF page count...")
+                        count_start = time.time()
+                        # Use a lightweight approach with multi-threading for faster processing
+                        pdf_info = convert_from_path(
+                            file_path,
+                            dpi=72,  # Low DPI just for info
+                            first_page=1,
+                            last_page=1,
+                            size=(100, 100),  # Tiny image to save memory
+                            fmt="jpeg",
+                            thread_count=4,  # Increased thread count for faster processing
+                            output_file=None
+                        )
+                        # Extract page count
+                        if hasattr(pdf_info, 'n_pages'):
+                            total_pages = pdf_info.n_pages
+                        elif isinstance(pdf_info, dict) and "Pages" in pdf_info:
+                            total_pages = int(pdf_info.get("Pages", "1"))
+                        elif len(pdf_first_page) > 0:
+                            # Just estimate based on first page - at least we have one
+                            total_pages = 1
+                        logger.info(f"Page count determined in {time.time() - count_start:.2f}s")
+                    except Exception as count_error:
+                        logger.warning(f"Error determining page count: {str(count_error)}. Using default of 1")
+                        total_pages = 1
+                    logger.info(f"PDF has {total_pages} total pages")
+                    # Determine which pages to process
+                    pages_to_process = []
+                    # Handle custom page selection if provided
+                    if custom_pages and any(0 < p <= total_pages for p in custom_pages):
+                        # Filter valid page numbers
+                        pages_to_process = [p for p in custom_pages if 0 < p <= total_pages]
+                        logger.info(f"Processing {len(pages_to_process)} custom-selected pages: {pages_to_process}")
+                    # Otherwise use max_pages limit if provided
+                    elif max_pages and max_pages < total_pages:
+                        pages_to_process = list(range(1, max_pages + 1))
+                        logger.info(f"Processing first {max_pages} pages of {total_pages} total")
+                    # Or process all pages if reasonable count
+                    elif total_pages <= 10:
+                        pages_to_process = list(range(1, total_pages + 1))
+                        logger.info(f"Processing all {total_pages} pages")
+                    # For large documents without limits, process subset of pages
+                    else:
+                        # Smart sampling: first page, last page, and some pages in between
+                        pages_to_process = [1]  # Always include first page
+                        if total_pages > 1:
+                            if total_pages <= 5:
+                                # For few pages, process all
+                                pages_to_process = list(range(1, total_pages + 1))
+                            else:
+                                # For many pages, sample intelligently
+                                # Add pages from the middle of the document
+                                middle = total_pages // 2
+                                # Add last page if more than 3 pages
+                                if total_pages > 3:
+                                    pages_to_process.append(total_pages)
+                                # Add up to 3 pages from middle if document is large
+                                if total_pages > 5:
+                                    pages_to_process.append(middle)
+                                if total_pages > 10:
+                                    pages_to_process.append(middle // 2)
+                                    pages_to_process.append(middle + (middle // 2))
+                        # Sort pages for sequential processing
+                        pages_to_process = sorted(list(set(pages_to_process)))
+                        logger.info(f"Processing {len(pages_to_process)} sampled pages out of {total_pages} total: {pages_to_process}")
+                    # Convert only the selected pages to minimize memory usage
+                    selected_images = []
+                    combined_text = []
+                    # Process pages in larger batches for better efficiency
+                    batch_size = 5  # Process 5 pages at a time for better throughput
+                    for i in range(0, len(pages_to_process), batch_size):
+                        batch_pages = pages_to_process[i:i+batch_size]
+                        logger.info(f"Converting batch of pages {batch_pages}")
+                        # Convert batch of pages with multi-threading for better performance
+                        batch_start = time.time()
+                        batch_images = convert_from_path(
+                            file_path,
+                            dpi=dpi,
+                            first_page=min(batch_pages),
+                            last_page=max(batch_pages),
+                            thread_count=4,  # Use multi-threading for faster PDF processing
+                            fmt="jpeg"       # Use JPEG format for better compatibility
+                        )
+                        logger.info(f"Batch conversion completed in {time.time() - batch_start:.2f}s")
+                        # Map converted images to requested page numbers
+                        for idx, page_num in enumerate(range(min(batch_pages), max(batch_pages) + 1)):
+                            if page_num in pages_to_process and idx < len(batch_images):
+                                if page_num == pages_to_process[0]:  # First page to process
+                                    selected_images.append(batch_images[idx])
+                                # Process each page individually
+                                with tempfile.NamedTemporaryFile(suffix='.jpeg', delete=False) as tmp:
+                                    batch_images[idx].save(tmp.name, format='JPEG')
+                                    # Simple OCR to extract text
+                                    try:
+                                        page_result = self._process_image(Path(tmp.name), False, None)
+                                        if 'ocr_contents' in page_result and 'raw_text' in page_result['ocr_contents']:
+                                            # Add page text to combined text
+                                            page_text = page_result['ocr_contents']['raw_text']
+                                            combined_text.append(f"--- PAGE {page_num} ---\n{page_text}")
+                                    except Exception as page_e:
+                                        logger.warning(f"Error processing page {page_num}: {str(page_e)}")
+                                    # Clean up temp file
+                                    import os
+                                    os.unlink(tmp.name)
+                    # If we have processed pages
+                    if selected_images and combined_text:
+                        # Save first image to temp file for vision model
+                        with tempfile.NamedTemporaryFile(suffix='.jpeg', delete=False) as tmp:
+                            selected_images[0].save(tmp.name, format='JPEG', quality=95)
+                            first_image_path = tmp.name
+                        # Combine all extracted text
+                        all_text = "\n\n".join(combined_text)
+                        # For custom prompts, use specialized processing
+                        if custom_prompt:
+                            try:
+                                # Process image with vision model
+                                result = self._process_image(Path(first_image_path), use_vision, None)
+                                # Enhance with text analysis using combined text from all pages
+                                enhanced_result = self._extract_structured_data_text_only(all_text, file_path.name, custom_prompt)
+                                # Merge results, keeping images from original result
+                                for key, value in enhanced_result.items():
+                                    if key not in ('raw_response_data', 'pages_data', 'has_images'):
+                                        result[key] = value
+                                # Update raw text with full document text
+                                if 'ocr_contents' in result:
+                                    result['ocr_contents']['raw_text'] = all_text
+                            except Exception as e:
+                                logger.warning(f"Custom prompt processing failed: {str(e)}. Using standard processing.")
+                                # Fall back to standard processing
+                                result = self._process_image(Path(first_image_path), use_vision, None)
+                                if 'ocr_contents' in result:
+                                    result['ocr_contents']['raw_text'] = all_text
+                        else:
+                            # Standard processing with combined text
+                            result = self._process_image(Path(first_image_path), use_vision, None)
+                            if 'ocr_contents' in result:
+                                result['ocr_contents']['raw_text'] = all_text
+                        # Add PDF metadata
+                        result['file_name'] = file_path.name
+                        result['pdf_processing_method'] = 'pdf2image_optimized'
+                        result['total_pages'] = total_pages
+                        result['processed_pages'] = len(pages_to_process)
+                        result['pages_processed'] = pages_to_process
+                        # Add processing info
+                        result['processing_info'] = {
+                            'method': 'local_pdf_processing',
+                            'dpi': dpi,
+                            'pages_sampled': pages_to_process,
+                            'processing_time': time.time() - start_time
+                        }
+                        # Clean up
+                        os.unlink(first_image_path)
+                        return result
+                    else:
+                        logger.warning("No pages successfully processed with pdf2image, falling back to API")
+                        raise Exception("Failed to process PDF pages locally")
+                except Exception as pdf2image_error:
+                    logger.warning(f"Local PDF processing failed, falling back to API: {str(pdf2image_error)}")
+                    # Fall back to API processing
+            # API-based PDF processing
+            logger.info("Processing PDF via Mistral API")
+            # Optimize file upload for faster processing
+            logger.info("Uploading PDF file to Mistral API")
+            upload_start = time.time()
+            # Set appropriate timeout based on file size
+            upload_timeout = max(60, min(300, int(file_size_mb * 5)))  # 60s to 300s based on size
+            try:
+                # Upload the file (Mistral client doesn't support timeout parameter for upload)
+                uploaded_file = self.client.files.upload(
+                    file={
+                        "file_name": file_path.stem,
+                        "content": file_path.read_bytes(),
+                    },
+                    purpose="ocr"
+                )
+                logger.info(f"PDF uploaded in {time.time() - upload_start:.2f}s")
+                # Get a signed URL for the uploaded file
+                signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
+                # Process the PDF with OCR - use adaptive timeout based on file size
+                logger.info(f"Processing PDF with OCR using {OCR_MODEL}")
+                # Adaptive retry strategy based on file size
+                max_retries = 3 if file_size_mb < 20 else 2  # Fewer retries for large files
+                base_retry_delay = 1 if file_size_mb < 10 else 2  # Longer delays for large files
+                # Adaptive timeout based on file size
+                ocr_timeout_ms = min(180000, max(60000, int(file_size_mb * 3000)))  # 60s to 180s
+                # Try processing with retries
+                for retry in range(max_retries):
+                    try:
+                        ocr_start = time.time()
+                        pdf_response = self.client.ocr.process(
+                            document=DocumentURLChunk(document_url=signed_url.url),
+                            model=OCR_MODEL,
+                            include_image_base64=True,
+                            timeout_ms=ocr_timeout_ms
+                        )
+                        logger.info(f"PDF OCR processing completed in {time.time() - ocr_start:.2f}s")
+                        break  # Success, exit retry loop
+                    except Exception as e:
+                        error_msg = str(e)
+                        logger.warning(f"API error on attempt {retry+1}/{max_retries}: {error_msg}")
+                        # Handle errors with optimized retry logic
+                        error_lower = error_msg.lower()
+                        # Authentication errors - no point in retrying
+                        if any(term in error_lower for term in ["unauthorized", "401", "403", "authentication"]):
+                            logger.error("API authentication failed. Check your API key.")
+                            raise ValueError(f"Authentication failed. Please verify your Mistral API key: {error_msg}")
+                        # Connection or server errors - worth retrying
+                        elif any(term in error_lower for term in ["connection", "timeout", "520", "server error", "502", "503", "504"]):
+                            if retry < max_retries - 1:
+                                # Exponential backoff with jitter for better retry behavior
+                                wait_time = base_retry_delay * (2 ** retry) * (0.8 + 0.4 * random.random())
+                                logger.info(f"Connection issue detected. Waiting {wait_time:.1f}s before retry...")
+                                time.sleep(wait_time)
+                            else:
+                                # Last retry failed
+                                logger.error("Maximum retries reached, API connection error persists.")
+                                raise ValueError(f"Could not connect to Mistral API after {max_retries} attempts: {error_msg}")
+                        # Rate limit errors - much longer wait
+                        elif any(term in error_lower for term in ["rate limit", "429", "too many requests", "requests rate limit exceeded"]):
+                            # Check specifically for token exhaustion vs temporary rate limit
+                            if "quota" in error_lower or "credit" in error_lower or "subscription" in error_lower:
+                                logger.error("API quota or credit limit reached. No retry will help.")
+                                raise ValueError(f"Mistral API quota or credit limit reached. Please check your subscription: {error_msg}")
+                            elif retry < max_retries - 1:
+                                wait_time = base_retry_delay * (2 ** retry) * 6.0  # Significantly longer wait for rate limits
+                                logger.info(f"Rate limit exceeded. Waiting {wait_time:.1f}s before retry...")
+                                time.sleep(wait_time)
+                            else:
+                                logger.error("Maximum retries reached, rate limit error persists.")
+                                raise ValueError(f"API rate limit exceeded. Please try again later: {error_msg}")
+                        # Misc errors - typically no retry will help
+                        else:
+                            if retry < max_retries - 1 and any(term in error_lower for term in ["transient", "temporary"]):
+                                # Only retry for errors explicitly marked as transient
+                                wait_time = base_retry_delay * (2 ** retry)
+                                logger.info(f"Transient error detected. Waiting {wait_time:.1f}s before retry...")
+                                time.sleep(wait_time)
+                            else:
+                                logger.error(f"Unrecoverable API error: {error_msg}")
+                                raise
+                # Calculate the number of pages to process
+                pages_to_process = pdf_response.pages
+                total_pages = len(pdf_response.pages)
+                limited_pages = False
+                logger.info(f"API returned {total_pages} total PDF pages")
+                # Smart page selection logic for better performance
+                if custom_pages:
+                    # Convert to 0-based indexing and filter valid page numbers
+                    valid_indices = [i-1 for i in custom_pages if 0 < i <= total_pages]
+                    if valid_indices:
+                        pages_to_process = [pdf_response.pages[i] for i in valid_indices]
+                        limited_pages = True
+                        logger.info(f"Processing {len(valid_indices)} custom-selected pages")
+                # Max pages limit with smart sampling
+                elif max_pages and total_pages > max_pages:
+                    if max_pages == 1:
+                        # Just first page
+                        pages_to_process = pages_to_process[:1]
+                    elif max_pages < 5 and total_pages > 10:
+                        # For small max_pages on large docs, include first, last, and middle
+                        indices = [0]  # First page
+                        if max_pages > 1:
+                            indices.append(total_pages - 1)  # Last page
+                        if max_pages > 2:
+                            indices.append(total_pages // 2)  # Middle page
+                        # Add more pages up to max_pages if needed
+                        if max_pages > 3:
+                            remaining = max_pages - len(indices)
+                            step = total_pages // (remaining + 1)
+                            for i in range(1, remaining + 1):
+                                idx = i * step
+                                if idx not in indices and 0 <= idx < total_pages:
+                                    indices.append(idx)
+                        indices.sort()
+                        pages_to_process = [pdf_response.pages[i] for i in indices]
+                    else:
+                        # Default: first max_pages
+                        pages_to_process = pages_to_process[:max_pages]
+                    limited_pages = True
+                    logger.info(f"Processing {len(pages_to_process)} pages out of {total_pages} total")
+                # Calculate confidence score if available
+                try:
+                    confidence_values = [page.confidence for page in pages_to_process if hasattr(page, 'confidence')]
+                    confidence_score = sum(confidence_values) / len(confidence_values) if confidence_values else 0.89
+                except Exception:
+                    confidence_score = 0.89  # Improved default
+                # Merge page content intelligently - include page numbers for better context
+                all_markdown = []
+                for idx, page in enumerate(pages_to_process):
+                    # Try to determine actual page number
+                    if custom_pages and len(custom_pages) == len(pages_to_process):
+                        page_num = custom_pages[idx]
+                    else:
+                        # Estimate page number - may not be accurate with sampling
+                        page_num = idx + 1
+                    page_markdown = page.markdown if hasattr(page, 'markdown') else ""
+                    # Add page header if content exists
+                    if page_markdown.strip():
+                        all_markdown.append(f"--- PAGE {page_num} ---\n{page_markdown}")
+                # Join all pages with separation
+                combined_markdown = "\n\n".join(all_markdown)
+                # Extract structured data with the appropriate model
+                if use_vision:
+                    # Try to get a good image for vision model
+                    vision_image = None
+                    # Try first page with images
+                    for page in pages_to_process:
+                        if hasattr(page, 'images') and page.images:
+                            vision_image = page.images[0].image_base64
+                            break
+                    if vision_image:
+                        # Use vision model with enhanced prompt
+                        logger.info(f"Using vision model: {VISION_MODEL}")
+                        result = self._extract_structured_data_with_vision(
+                            vision_image, combined_markdown, file_path.name, custom_prompt
+                        )
+                    else:
+                        # Fall back to text-only if no images available
+                        logger.info(f"No images in PDF, falling back to text model: {TEXT_MODEL}")
+                        result = self._extract_structured_data_text_only(
+                            combined_markdown, file_path.name, custom_prompt
+                        )
+                else:
+                    # Use text-only model as requested
+                    logger.info(f"Using text-only model as specified: {TEXT_MODEL}")
+                    result = self._extract_structured_data_text_only(
+                        combined_markdown, file_path.name, custom_prompt
+                    )
+                # Add metadata about pages
+                if limited_pages:
+                    result['limited_pages'] = {
+                        'processed': len(pages_to_process),
+                        'total': total_pages
+                    }
+                # Set confidence score from OCR
+                result['confidence_score'] = confidence_score
+                # Add processing method info
+                result['pdf_processing_method'] = 'api'
+                result['total_pages'] = total_pages
+                result['processed_pages'] = len(pages_to_process)
+                # Store serialized OCR response for rendering
+                serialized_response = serialize_ocr_response(pdf_response)
+                result['raw_response_data'] = serialized_response
+                # Check if there are images to include
+                has_images = hasattr(pdf_response, 'pages') and any(
+                    hasattr(page, 'images') and page.images for page in pdf_response.pages
+                )
+                result['has_images'] = has_images
+                # Include image data for rendering if available
+                if has_images:
+                    # Prepare pages data with image references
+                    result['pages_data'] = []
+                    # Get serialized pages - handle different formats
+                    serialized_pages = None
+                    try:
+                        if hasattr(serialized_response, 'pages'):
+                            serialized_pages = serialized_response.pages
+                        elif isinstance(serialized_response, dict) and 'pages' in serialized_response:
+                            serialized_pages = serialized_response.get('pages', [])
+                        else:
+                            # No pages found in response
+                            logger.warning("No pages found in OCR response")
+                            serialized_pages = []
+                    except Exception as pages_err:
+                        logger.warning(f"Error extracting pages from OCR response: {str(pages_err)}")
+                        serialized_pages = []
+                    # Process each page to extract images
+                    for page_idx, page in enumerate(serialized_pages):
+                        try:
+                            # Skip processing pages not in our selection
+                            if limited_pages and page_idx >= len(pages_to_process):
+                                continue
+                            # Extract page data with careful error handling
+                            markdown = ""
+                            images = []
+                            # Handle different page formats safely
+                            if isinstance(page, dict):
+                                markdown = page.get('markdown', '')
+                                images = page.get('images', [])
+                            else:
+                                # Try attribute access
+                                if hasattr(page, 'markdown'):
+                                    markdown = page.markdown
+                                if hasattr(page, 'images'):
+                                    images = page.images
+                            # Create page data record
+                            page_data = {
+                                'page_number': page_idx + 1,
+                                'markdown': markdown,
+                                'images': []
+                            }
+                            # Process images with careful error handling
+                            for img_idx, img in enumerate(images):
+                                try:
+                                    # Extract image ID and base64 data
+                                    img_id = None
+                                    img_base64 = None
+                                    if isinstance(img, dict):
+                                        img_id = img.get('id')
+                                        img_base64 = img.get('image_base64')
+                                    else:
+                                        # Try attribute access
+                                        if hasattr(img, 'id'):
+                                            img_id = img.id
+                                        if hasattr(img, 'image_base64'):
+                                            img_base64 = img.image_base64
+                                    # Only add if we have valid image data
+                                    if img_base64 and isinstance(img_base64, str):
+                                        # Ensure ID exists
+                                        safe_id = img_id if img_id else f"img_{page_idx}_{img_idx}"
+                                        page_data['images'].append({
+                                            'id': safe_id,
+                                            'image_base64': img_base64
+                                        })
+                                except Exception as img_err:
+                                    logger.warning(f"Error processing image {img_idx} on page {page_idx+1}: {str(img_err)}")
+                                    continue  # Skip this image
+                            # Add page data if it has content
+                            if page_data['markdown'] or page_data['images']:
+                                result['pages_data'].append(page_data)
+                        except Exception as page_err:
+                            logger.warning(f"Error processing page {page_idx+1}: {str(page_err)}")
+                            continue  # Skip this page
+                # Record final processing time
+                total_time = time.time() - start_time
+                result['processing_time'] = total_time
+                logger.info(f"PDF API processing completed in {total_time:.2f}s")
+                return result
+            except Exception as api_e:
+                logger.error(f"Error in API-based PDF processing: {str(api_e)}")
+                # Re-raise to be caught by outer exception handler
+                raise
+        except Exception as e:
+            # Log the error and return a helpful error result
+            logger.error(f"Error processing PDF: {str(e)}")
+            # Return basic result on error
+            return {
+                "file_name": file_path.name,
+                "topics": ["Document"],
+                "languages": ["English"],
+                "confidence_score": 0.0,
+                "error": str(e),
+                "ocr_contents": {
+                    "error": f"Failed to process PDF: {str(e)}",
+                    "partial_text": "Document could not be fully processed."
+                },
+                "processing_time": time.time() - start_time
+            }
+    def _process_image(self, file_path, use_vision=True, custom_prompt=None):
+        """Process an image file with OCR"""
+        logger = logging.getLogger("image_processor")
+        logger.info(f"Processing image: {file_path}")
+        # Check if we're in test mode
+        if self.test_mode:
+            # Return a placeholder document response
+            return {
+                "file_name": file_path.name,
+                "topics": ["Document"],
+                "languages": ["English"],
+                "ocr_contents": {
+                    "title": "Document",
+                    "content": "Please set up API key to process documents."
+                },
+                "processing_time": 0.5,
+                "confidence_score": 0.0
+            }
+        try:
+            # Check file size
+            file_size_mb = file_path.stat().st_size / (1024 * 1024)
+            logger.info(f"Original image size: {file_size_mb:.2f} MB")
+            # Use enhanced preprocessing functions from ocr_utils
+            try:
+                from ocr_utils import preprocess_image_for_ocr, IMAGE_PREPROCESSING
+                logger.info(f"Applying advanced image preprocessing for OCR")
+                # Get preprocessing settings from config
+                max_size_mb = IMAGE_PREPROCESSING.get("max_size_mb", 8.0)
+                if file_size_mb > max_size_mb:
+                    logger.info(f"Image is large ({file_size_mb:.2f} MB), optimizing for API submission")
+                # Preprocess image with document-type detection and appropriate enhancements
+                _, base64_data_url = preprocess_image_for_ocr(file_path)
+                logger.info(f"Image preprocessing completed successfully")
+            except (ImportError, AttributeError) as e:
+                # Fallback to basic processing if advanced functions not available
+                logger.warning(f"Advanced preprocessing not available: {str(e)}. Using basic image processing.")
+                # If image is larger than 8MB, resize it to reduce API payload size
+                if file_size_mb > 8:
+                    logger.info("Image is large, resizing before API submission")
+                    try:
+                        from PIL import Image
+                        import io
+                        # Open and process the image
+                        with Image.open(file_path) as img:
+                            # Convert to RGB if not already (prevents mode errors)
+                            if img.mode != 'RGB':
+                                img = img.convert('RGB')
+                            # Calculate new dimensions (maintain aspect ratio)
+                            # Target around 2000-2500 pixels on longest side for better OCR quality
+                            width, height = img.size
+                            max_dimension = max(width, height)
+                            target_dimension = 2000  # Restored to 2000 for better image quality
+                            if max_dimension > target_dimension:
+                                scale_factor = target_dimension / max_dimension
+                                resized_width = int(width * scale_factor)
+                                resized_height = int(height * scale_factor)
+                                # Use LANCZOS instead of BILINEAR for better quality
+                                img = img.resize((resized_width, resized_height), Image.LANCZOS)
+                            # Enhance contrast for better text recognition
+                            from PIL import ImageEnhance
+                            enhancer = ImageEnhance.Contrast(img)
+                            img = enhancer.enhance(1.3)
+                            # Save to bytes with compression
+                            buffer = io.BytesIO()
+                            img.save(buffer, format="JPEG", quality=92, optimize=True)  # Higher quality for better OCR
+                            buffer.seek(0)
+                            # Get the base64
+                            encoded_image = base64.b64encode(buffer.getvalue()).decode()
+                            base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
+                            # Log the new size
+                            new_size_mb = len(buffer.getvalue()) / (1024 * 1024)
+                            logger.info(f"Resized image to {new_size_mb:.2f} MB")
+                    except ImportError:
+                        logger.warning("PIL not available for resizing. Using original image.")
+                        encoded_image = base64.b64encode(file_path.read_bytes()).decode()
+                        base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
+                    except Exception as e:
+                        logger.warning(f"Image resize failed: {str(e)}. Using original image.")
+                        encoded_image = base64.b64encode(file_path.read_bytes()).decode()
+                        base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
+                else:
+                    # For smaller images, use as-is
+                    encoded_image = base64.b64encode(file_path.read_bytes()).decode()
+                    base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
+            except Exception as e:
+                # Fallback to original image if any preprocessing fails
+                logger.warning(f"Image preprocessing failed: {str(e)}. Using original image.")
+                encoded_image = base64.b64encode(file_path.read_bytes()).decode()
+                base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
+            # Process the image with OCR
+            logger.info(f"Processing image with OCR using {OCR_MODEL}")
+            # Add retry logic with more retries and longer backoff periods for rate limit issues
+            max_retries = 4  # Increased from 2 to give more chances to succeed
+            retry_delay = 2  # Increased from 1 to allow for longer backoff periods
+            for retry in range(max_retries):
+                try:
+                    image_response = self.client.ocr.process(
+                        document=ImageURLChunk(image_url=base64_data_url),
+                        model=OCR_MODEL,
+                        include_image_base64=True,
+                        timeout_ms=90000  # 90 second timeout for better success rate
+                    )
+                    break  # Success, exit retry loop
+                except Exception as e:
+                    error_msg = str(e)
+                    logger.warning(f"API error on attempt {retry+1}/{max_retries}: {error_msg}")
+                    # Check specific error types to handle them appropriately
+                    error_lower = error_msg.lower()
+                    # Authentication errors - no point in retrying
+                    if "unauthorized" in error_lower or "401" in error_lower:
+                        logger.error("API authentication failed. Check your API key.")
+                        raise ValueError(f"Authentication failed with API key. Please verify your Mistral API key is correct and active: {error_msg}")
+                    # Connection errors - worth retrying
+                    elif "connection" in error_lower or "timeout" in error_lower or "520" in error_msg or "server error" in error_lower:
+                        if retry < max_retries - 1:
+                            # Wait with shorter delay before retrying
+                            wait_time = retry_delay * (2 ** retry)
+                            logger.info(f"Connection issue detected. Waiting {wait_time}s before retry...")
+                            time.sleep(wait_time)
+                        else:
+                            # Last retry failed
+                            logger.error("Maximum retries reached, API connection error persists.")
+                            raise ValueError(f"Could not connect to Mistral API after {max_retries} attempts: {error_msg}")
+                    # Rate limit errors
+                    elif "rate limit" in error_lower or "429" in error_lower or "requests rate limit exceeded" in error_lower:
+                        # Check specifically for token exhaustion vs temporary rate limit
+                        if "quota" in error_lower or "credit" in error_lower or "subscription" in error_lower:
+                            logger.error("API quota or credit limit reached. No retry will help.")
+                            raise ValueError(f"Mistral API quota or credit limit reached. Please check your subscription: {error_msg}")
+                        elif retry < max_retries - 1:
+                            # More aggressive backoff for rate limits
+                            wait_time = retry_delay * (2 ** retry) * 5  # 5x longer wait for rate limits
+                            logger.info(f"Rate limit exceeded. Waiting {wait_time}s before retry...")
+                            time.sleep(wait_time)
+                        else:
+                            # Last retry failed, try local OCR as fallback
+                            logger.error("Maximum retries reached, rate limit error persists.")
+                            try:
+                                # Try to import the local OCR fallback function
+                                from ocr_utils import try_local_ocr_fallback
+                                # Attempt local OCR fallback
+                                ocr_text = try_local_ocr_fallback(file_path, base64_data_url)
+                                if ocr_text:
+                                    logger.info("Successfully used local OCR fallback")
+                                    # Return a basic result with the local OCR text
+                                    return {
+                                        "file_name": file_path.name,
+                                        "topics": ["Document"],
+                                        "languages": ["English"],
+                                        "ocr_contents": {
+                                            "title": "Document (Local OCR)",
+                                            "content": "This document was processed with local OCR due to API rate limiting.",
+                                            "raw_text": ocr_text
+                                        },
+                                        "processing_method": "local_fallback",
+                                        "processing_note": "Used local OCR due to API rate limit"
+                                    }
+                            except (ImportError, Exception) as local_err:
+                                logger.warning(f"Local OCR fallback failed: {str(local_err)}")
+                            # If we get here, both API and local OCR failed
+                            raise ValueError(f"Mistral API rate limit exceeded. Please try again later: {error_msg}")
+                    # Other errors - no retry
+                    else:
+                        logger.error(f"Unrecoverable API error: {error_msg}")
+                        raise
+            # Get the OCR markdown from the first page
+            image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""
+            # Optimize: Skip vision model step if ocr_markdown is very small or empty
+            if not image_ocr_markdown or len(image_ocr_markdown) < 50:
+                logger.warning("OCR produced minimal or no text. Returning basic result.")
+                return {
+                    "file_name": file_path.name,
+                    "topics": ["Document"],
+                    "languages": ["English"],
+                    "ocr_contents": {
+                        "raw_text": image_ocr_markdown if image_ocr_markdown else "No text could be extracted from the image."
+                    },
+                    "processing_note": "OCR produced minimal text content"
+                }
+            # Extract structured data using the appropriate model, with a single API call
+            if use_vision:
+                logger.info(f"Using vision model: {VISION_MODEL}")
+                result = self._extract_structured_data_with_vision(base64_data_url, image_ocr_markdown, file_path.name, custom_prompt)
+            else:
+                logger.info(f"Using text-only model: {TEXT_MODEL}")
+                result = self._extract_structured_data_text_only(image_ocr_markdown, file_path.name, custom_prompt)
+            # Store the serialized OCR response for image rendering (for compatibility with original version)
+            # Don't store raw_response directly as it's not JSON serializable
+            serialized_response = serialize_ocr_response(image_response)
+            result['raw_response_data'] = serialized_response
+            # Store key parts of the OCR response for image rendering
+            # With serialized format that can be stored in JSON
+            has_images = hasattr(image_response, 'pages') and image_response.pages and hasattr(image_response.pages[0], 'images') and image_response.pages[0].images
+            result['has_images'] = has_images
+            if has_images:
+                # Serialize the entire response to ensure it's JSON serializable
+                serialized_response = serialize_ocr_response(image_response)
+                # Create a structured representation of images that can be serialized
+                result['pages_data'] = []
+                if hasattr(serialized_response, 'pages'):
+                    serialized_pages = serialized_response.pages
+                else:
+                    # Handle case where serialization returns a dict instead of an object
+                    serialized_pages = serialized_response.get('pages', [])
+                for page_idx, page in enumerate(serialized_pages):
+                    # Handle both object and dict forms
+                    if isinstance(page, dict):
+                        markdown = page.get('markdown', '')
+                        images = page.get('images', [])
+                    else:
+                        markdown = page.markdown if hasattr(page, 'markdown') else ''
+                        images = page.images if hasattr(page, 'images') else []
+                    page_data = {
+                        'page_number': page_idx + 1,
+                        'markdown': markdown,
+                        'images': []
+                    }
+                    # Extract images if present
+                    for img_idx, img in enumerate(images):
+                        img_id = None
+                        img_base64 = None
+                        if isinstance(img, dict):
+                            img_id = img.get('id')
+                            img_base64 = img.get('image_base64')
+                        else:
+                            img_id = img.id if hasattr(img, 'id') else None
+                            img_base64 = img.image_base64 if hasattr(img, 'image_base64') else None
+                        if img_base64:
+                            page_data['images'].append({
+                                'id': img_id if img_id else f"img_{page_idx}_{img_idx}",
+                                'image_base64': img_base64
+                            })
+                    result['pages_data'].append(page_data)
+            logger.info("Image processing completed successfully")
+            return result
+        except Exception as e:
+            logger.error(f"Error processing image: {str(e)}")
+            # Return basic result on error
+            return {
+                "file_name": file_path.name,
+                "topics": ["Document"],
+                "languages": ["English"],
+                "error": str(e),
+                "ocr_contents": {
+                    "error": f"Failed to process image: {str(e)}",
+                    "partial_text": "Image could not be processed."
+                }
+            }
+    def _extract_structured_data_with_vision(self, image_base64, ocr_markdown, filename, custom_prompt=None):
+        """
+        Extract structured data using vision model with detailed historical context prompting
+        Optimized for speed, accuracy, and resilience
+        """
+        logger = logging.getLogger("vision_processor")
+        try:
+            # Fast path: Skip vision API for minimal OCR text (saves an API call)
+            if not ocr_markdown or len(ocr_markdown.strip()) < 100:  # Increased threshold for better detection
+                logger.info("Minimal OCR text detected, skipping vision model processing")
+                return {
+                    "file_name": filename,
+                    "topics": ["Document"],
+                    "languages": ["English"],
+                    "ocr_contents": {
+                        "raw_text": ocr_markdown if ocr_markdown else "No text could be extracted"
+                    }
+                }
+            # Fast path: Skip if in test mode or no API key
+            if self.test_mode or not self.api_key:
+                logger.info("Test mode or no API key, using text-only processing")
+                return self._extract_structured_data_text_only(ocr_markdown, filename)
+            # Detect document type with optimized cached implementation
+            doc_type = self._detect_document_type(custom_prompt, ocr_markdown)
+            logger.info(f"Detected document type: {doc_type}")
+            # Optimize OCR text for processing - focus on the first part which usually contains
+            # the most important information (title, metadata, etc.)
+            if len(ocr_markdown) > 8000:
+                # Start with first 5000 chars
+                first_part = ocr_markdown[:5000]
+                # Then add representative samples from different parts of the document
+                # This captures headings and key information throughout
+                middle_start = len(ocr_markdown) // 2 - 1000
+                middle_part = ocr_markdown[middle_start:middle_start+2000] if middle_start > 0 else ""
+                # Get ending section if large enough
+                if len(ocr_markdown) > 15000:
+                    end_part = ocr_markdown[-1000:]
+                    truncated_ocr = f"{first_part}\n...\n{middle_part}\n...\n{end_part}"
+                else:
+                    truncated_ocr = f"{first_part}\n...\n{middle_part}"
+                logger.info(f"Truncated OCR text from {len(ocr_markdown)} to {len(truncated_ocr)} chars")
+            else:
+                truncated_ocr = ocr_markdown
+            # Build an optimized prompt based on document type
+            enhanced_prompt = self._build_enhanced_prompt(doc_type, truncated_ocr, custom_prompt)
+            # Measure API call time for optimization feedback
+            start_time = time.time()
+            try:
+                # Try with enhanced timing parameters based on document complexity
+                # Use shorter timeout for smaller documents
+                timeout_ms = min(120000, max(60000, len(truncated_ocr) * 10))  # 60-120 seconds based on text length
+                logger.info(f"Calling vision model with {timeout_ms}ms timeout and document type {doc_type}")
+                chat_response = self.client.chat.parse(
+                    model=VISION_MODEL,
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                ImageURLChunk(image_url=image_base64),
+                                TextChunk(text=enhanced_prompt)
+                            ],
+                        },
+                    ],
+                    response_format=StructuredOCRModel,
+                    temperature=0,
+                    timeout_ms=timeout_ms
+                )
+                api_time = time.time() - start_time
+                logger.info(f"Vision model completed in {api_time:.2f}s with document type: {doc_type}")
+            except Exception as e:
+                # If there's an error with the enhanced prompt, try progressively simpler approaches
+                logger.warning(f"Enhanced prompt failed after {time.time() - start_time:.2f}s: {str(e)}")
+                # Try a simplified approach with less context
+                try:
+                    # Shorter prompt with less contextual information
+                    simplified_prompt = (
+                        f"You are an expert in historical document analysis. "
+                        f"Analyze this document image and the OCR text below. "
+                        f"<BEGIN_OCR>\n{truncated_ocr[:4000]}\n<END_OCR>\n"
+                        f"Identify the document type, main topics, languages used, and extract key information "
+                        f"including names, dates, places, and events. Return a structured JSON response."
+                    )
+                    # Add custom prompt if provided
+                    if custom_prompt:
+                        simplified_prompt += f"\n\nAdditional instructions: {custom_prompt}"
+                    logger.info(f"Trying simplified prompt approach")
+                    chat_response = self.client.chat.parse(
+                        model=VISION_MODEL,
+                        messages=[
+                            {
+                                "role": "user",
+                                "content": [
+                                    ImageURLChunk(image_url=image_base64),
+                                    TextChunk(text=simplified_prompt)
+                                ],
+                            },
+                        ],
+                        response_format=StructuredOCRModel,
+                        temperature=0,
+                        timeout_ms=60000  # Shorter timeout for simplified approach
+                    )
+                    logger.info(f"Simplified prompt approach succeeded")
+                except Exception as second_e:
+                    # If that fails, try with minimal prompt and just image analysis
+                    logger.warning(f"Simplified prompt failed: {str(second_e)}. Trying minimal prompt.")
+                    try:
+                        # Minimal prompt focusing on just the image
+                        minimal_prompt = (
+                            f"Analyze this historical document image. "
+                            f"Extract the document type, main topics, languages, and key information. "
+                            f"Provide your analysis in a structured JSON format."
+                        )
+                        logger.info(f"Trying minimal prompt with image-only focus")
+                        chat_response = self.client.chat.parse(
+                            model=VISION_MODEL,
+                            messages=[
+                                {
+                                    "role": "user",
+                                    "content": [
+                                        ImageURLChunk(image_url=image_base64),
+                                        TextChunk(text=minimal_prompt)
+                                    ],
+                                },
+                            ],
+                            response_format=StructuredOCRModel,
+                            temperature=0,
+                            timeout_ms=45000  # Even shorter timeout for minimal approach
+                        )
+                        logger.info(f"Minimal prompt approach succeeded")
+                    except Exception as third_e:
+                        # If all vision attempts fail, fall back to text-only model
+                        logger.warning(f"All vision model attempts failed, falling back to text-only model: {str(third_e)}")
+                        return self._extract_structured_data_text_only(ocr_markdown, filename)
+            # Convert the response to a dictionary
+            result = json.loads(chat_response.choices[0].message.parsed.json())
+            # Ensure languages is a list of strings, not Language enum objects
+            if 'languages' in result:
+                result['languages'] = [str(lang) for lang in result.get('languages', [])]
+            # Add metadata about processing
+            result['processing_info'] = {
+                'method': 'vision_model',
+                'document_type': doc_type,
+                'ocr_text_length': len(ocr_markdown),
+                'api_response_time': time.time() - start_time
+            }
+            # Add confidence score if not present
+            if 'confidence_score' not in result:
+                result['confidence_score'] = 0.92  # Vision model typically has higher confidence
+        except Exception as e:
+            # Fall back to text-only model if vision model fails
+            logger.warning(f"Vision model processing failed, falling back to text-only model: {str(e)}")
+            result = self._extract_structured_data_text_only(ocr_markdown, filename)
+        return result
+    # Class-level cache of document type detection results, shared by every instance.
+    # Maps a cache key string to the detected document type string.
+    # NOTE(review): this is a plain dict with no lock around it, so it should not be
+    # relied on as thread-safe.
+    _doc_type_cache = {}
+    # Entry count at which _detect_document_type_cached starts evicting the oldest entries
+    _doc_type_cache_size = 256
+    @staticmethod
+    def _detect_document_type_cached(custom_prompt: Optional[str], ocr_text_sample: str) -> str:
+        """
+        Cached version of document type detection logic with thread-safe implementation
+        """
+        # Generate cache key - use first 50 chars of prompt and ocr_text to avoid memory issues
+        prompt_key = str(custom_prompt)[:50] if custom_prompt else ""
+        text_key = ocr_text_sample[:50] if ocr_text_sample else ""
+        cache_key = f"{prompt_key}::{text_key}"
+        # Check cache first (fast path)
+        if cache_key in StructuredOCR._doc_type_cache:
+            return StructuredOCR._doc_type_cache[cache_key]
+        # Set default document type
+        doc_type = "general"
+        # Optimized pattern matching with compiled lookup dictionaries
+        doc_type_patterns = {
+            "handwritten": ["handwritten", "handwriting", "cursive", "manuscript"],
+            "letter": ["letter", "correspondence", "message", "dear sir", "dear madam", "sincerely", "yours truly"],
+            "legal": ["form", "contract", "agreement", "legal", "certificate", "court", "attorney", "plaintiff", "defendant"],
+            "recipe": ["recipe", "food", "ingredients", "directions", "tbsp", "tsp", "cup", "mix", "bake", "cooking"],
+            "travel": ["travel", "expedition", "journey", "exploration", "voyage", "destination", "map"],
+            "scientific": ["scientific", "experiment", "hypothesis", "research", "study", "analysis", "results", "procedure"],
+            "newspaper": ["news", "newspaper", "article", "press", "headline", "column", "editor"]
+        }
+        # Fast custom prompt matching
+        if custom_prompt:
+            prompt_lower = custom_prompt.lower()
+            # Optimized pattern matching with early exit
+            for detected_type, patterns in doc_type_patterns.items():
+                if any(term in prompt_lower for term in patterns):
+                    doc_type = detected_type
+                    break
+        # Fast OCR text matching if still general type
+        if doc_type == "general" and ocr_text_sample:
+            ocr_lower = ocr_text_sample.lower()
+            # Use the same patterns dictionary for consistency, but scan the OCR text
+            for detected_type, patterns in doc_type_patterns.items():
+                if any(term in ocr_lower for term in patterns):
+                    doc_type = detected_type
+                    break
+        # Cache the result with improved LRU-like behavior
+        if len(StructuredOCR._doc_type_cache) >= StructuredOCR._doc_type_cache_size:
+            # Clear multiple entries at once for better performance
+            try:
+                # Remove up to 20 entries to avoid frequent cache clearing
+                for _ in range(20):
+                    if StructuredOCR._doc_type_cache:
+                        StructuredOCR._doc_type_cache.pop(next(iter(StructuredOCR._doc_type_cache)))
+            except:
+                # If concurrent modification causes issues, just proceed
+                pass
+        # Store in cache
+        StructuredOCR._doc_type_cache[cache_key] = doc_type
+        return doc_type
+    def _detect_document_type(self, custom_prompt: Optional[str], ocr_text: str) -> str:
+        """
+        Detect document type based on content and custom prompt.
+        Args:
+            custom_prompt: User-provided custom prompt
+            ocr_text: OCR-extracted text
+        Returns:
+            Document type identifier ("handwritten", "printed", "letter", etc.)
+        """
+        # Only sample first 1000 characters of OCR text for faster processing while maintaining accuracy
+        ocr_sample = ocr_text[:1000] if ocr_text else ""
+        # Use the cached version for better performance
+        return self._detect_document_type_cached(custom_prompt, ocr_sample)
+    def _build_enhanced_prompt(self, doc_type: str, ocr_text: str, custom_prompt: Optional[str]) -> str:
+        """
+        Build an enhanced prompt based on document type.
+        Args:
+            doc_type: Detected document type
+            ocr_text: OCR-extracted text
+            custom_prompt: User-provided custom prompt
+        Returns:
+            Enhanced prompt optimized for the document type
+        """
+        # Generic document section (included in all prompts)
+        generic_section = (
+            f"This is a historical document's OCR text:\n"
+            f"<BEGIN_OCR>\n{ocr_text}\n<END_OCR>\n\n"
+        )
+        # Document-specific prompting
+        if doc_type == "handwritten":
+            specific_section = (
+                f"You are an expert historian specializing in handwritten document transcription and analysis. "
+                f"The OCR system has attempted to capture the handwriting, but may have made errors with cursive script "
+                f"or unusual letter formations.\n\n"
+                f"Pay careful attention to:\n"
+                f"- Correcting OCR errors common in handwriting recognition\n"
+                f"- Preserving the original document structure\n"
+                f"- Identifying topics, language(s), and document type accurately\n"
+                f"- Detecting any names, dates, places, or events mentioned\n"
+            )
+        elif doc_type == "letter":
+            specific_section = (
+                f"You are an expert in historical correspondence analysis. "
+                f"Analyze this letter as a historian would, identifying:\n"
+                f"- Sender and recipient (if mentioned)\n"
+                f"- Date and location of writing (if present)\n"
+                f"- Key topics discussed\n"
+                f"- Historical context and significance\n"
+                f"- Sentiment and tone of the communication\n"
+                f"- Closing formulations and signature\n"
+            )
+        elif doc_type == "recipe":
+            specific_section = (
+                f"You are a culinary historian specializing in historical recipes. "
+                f"Analyze this recipe document to extract:\n"
+                f"- Recipe name/title\n"
+                f"- Complete list of ingredients with measurements\n"
+                f"- Preparation instructions in correct order\n"
+                f"- Cooking time and temperature if mentioned\n"
+                f"- Serving suggestions or yield information\n"
+                f"- Any cultural or historical context provided\n"
+            )
+        elif doc_type == "travel":
+            specific_section = (
+                f"You are a historian specializing in historical travel and exploration accounts. "
+                f"Analyze this document to extract:\n"
+                f"- Geographical locations mentioned\n"
+                f"- Names of explorers, ships, or expeditions\n"
+                f"- Dates and timelines\n"
+                f"- Descriptions of indigenous peoples, cultures, or local conditions\n"
+                f"- Natural features, weather, or navigational details\n"
+                f"- Historical significance of the journey described\n"
+            )
+        elif doc_type == "scientific":
+            specific_section = (
+                f"You are a historian of science specializing in historical scientific documents. "
+                f"Analyze this document to extract:\n"
+                f"- Scientific methodology described\n"
+                f"- Observations, measurements, or data presented\n"
+                f"- Scientific terminology of the period\n"
+                f"- Experimental apparatus or tools mentioned\n"
+                f"- Conclusions or hypotheses presented\n"
+                f"- Historical significance within scientific development\n"
+            )
+        elif doc_type == "newspaper":
+            specific_section = (
+                f"You are a media historian specializing in historical newspapers and publications. "
+                f"Analyze this document to extract:\n"
+                f"- Publication name and date if present\n"
+                f"- Headlines and article titles\n"
+                f"- Main news content with focus on events, people, and places\n"
+                f"- Advertisement content if present\n"
+                f"- Historical context and significance\n"
+                f"- Editorial perspective or bias if detectable\n"
+            )
+        elif doc_type == "legal":
+            specific_section = (
+                f"You are a legal historian specializing in historical legal documents. "
+                f"Analyze this document to extract:\n"
+                f"- Document type (contract, certificate, will, deed, etc.)\n"
+                f"- Parties involved and their roles\n"
+                f"- Key terms, conditions, or declarations\n"
+                f"- Dates, locations, and jurisdictions mentioned\n"
+                f"- Legal terminology of the period\n"
+                f"- Signatures, witnesses, or official markings\n"
+            )
+        else:
+            # General historical document
+            specific_section = (
+                f"You are a historian specializing in historical document analysis. "
+                f"Analyze this document to extract:\n"
+                f"- Document type and purpose\n"
+                f"- Time period and historical context\n"
+                f"- Key topics, themes, and subjects\n"
+                f"- People, places, and events mentioned\n"
+                f"- Languages used and writing style\n"
+                f"- Historical significance and connections\n"
+            )
+        # Output instructions
+        output_section = (
+            f"Create a structured JSON response with the following fields:\n"
+            f"- file_name: The document's name\n"
+            f"- topics: An array of topics covered in the document\n"
+            f"- languages: An array of languages used in the document\n"
+            f"- ocr_contents: A dictionary with the document's contents, organized logically\n"
+        )
+        # Add custom prompt if provided
+        custom_section = ""
+        if custom_prompt:
+            custom_section = f"\n\nADDITIONAL CONTEXT AND INSTRUCTIONS:\n{custom_prompt}\n"
+        # Combine all sections into complete prompt
+        return generic_section + specific_section + output_section + custom_section
+    def _extract_structured_data_text_only(self, ocr_markdown, filename, custom_prompt=None):
+        """
+        Extract structured data using text-only model with detailed historical context prompting
+        and improved error handling.
+        Exceptions raised while detecting the document type, building the prompt or calling
+        the model are caught here and converted into a fallback result built from the raw OCR text.
+        Args:
+            ocr_markdown: OCR text to analyze; may be empty or None
+            filename: Stored under "file_name" in the basic and fallback results
+            custom_prompt: Optional extra instructions, forwarded to document type detection
+                and to the prompt builder
+        Returns:
+            A dict. The basic and fallback results contain "file_name", "topics", "languages"
+            and "ocr_contents"; the model path returns the parsed model response plus
+            "processing_method", "document_type", "model_used" and "processing_time".
+            "processing_method" is "minimal_text", "test_mode", "text_model" or "fallback"
+            (it is absent only from the last-resort result).
+        """
+        logger = logging.getLogger("text_processor")
+        start_time = time.time()
+        try:
+            # Fast path: Skip for minimal OCR text
+            # (None, empty, or fewer than 50 characters once surrounding whitespace is stripped)
+            if not ocr_markdown or len(ocr_markdown.strip()) < 50:
+                logger.info("Minimal OCR text - returning basic result")
+                return {
+                    "file_name": filename,
+                    "topics": ["Document"],
+                    "languages": ["English"],
+                    "ocr_contents": {
+                        "raw_text": ocr_markdown if ocr_markdown else "No text could be extracted"
+                    },
+                    "processing_method": "minimal_text"
+                }
+            # Check for API key to avoid unnecessary processing
+            # NOTE(review): the "note" text below says the API key is missing even when this
+            # branch is taken only because self.test_mode is set
+            if self.test_mode or not self.api_key:
+                logger.info("Test mode or no API key - returning basic result")
+                return {
+                    "file_name": filename,
+                    "topics": ["Document"],
+                    "languages": ["English"],
+                    "ocr_contents": {
+                        "raw_text": ocr_markdown[:10000] if ocr_markdown else "No text could be extracted",
+                        "note": "API key not provided - showing raw OCR text only"
+                    },
+                    "processing_method": "test_mode"
+                }
+            # Detect document type and build enhanced prompt
+            doc_type = self._detect_document_type(custom_prompt, ocr_markdown)
+            logger.info(f"Detected document type: {doc_type}")
+            # If OCR text is very large, truncate it to avoid API limits
+            truncated_text = ocr_markdown
+            if len(ocr_markdown) > 25000:
+                # Keep first 15000 chars and last 5000 chars
+                truncated_text = ocr_markdown[:15000] + "\n...[content truncated]...\n" + ocr_markdown[-5000:]
+                logger.info(f"OCR text truncated from {len(ocr_markdown)} to {len(truncated_text)} chars")
+            # Build the prompt with truncated text if needed
+            enhanced_prompt = self._build_enhanced_prompt(doc_type, truncated_text, custom_prompt)
+            # Use enhanced prompt with text-only model - with retry logic
+            # max_retries counts total attempts: one initial call plus at most one retry
+            max_retries = 2
+            retry_delay = 1
+            for retry in range(max_retries):
+                try:
+                    logger.info(f"Calling text model ({TEXT_MODEL})")
+                    api_start = time.time()
+                    # Set appropriate timeout based on text length
+                    # 5 ms per character of prompt text, clamped to the range 30000-120000 ms
+                    timeout_ms = min(120000, max(30000, len(truncated_text) * 5))  # 30-120s based on length
+                    # Make API call with appropriate timeout
+                    chat_response = self.client.chat.parse(
+                        model=TEXT_MODEL,
+                        messages=[
+                            {
+                                "role": "user",
+                                "content": enhanced_prompt
+                            },
+                        ],
+                        response_format=StructuredOCRModel,
+                        temperature=0,
+                        timeout_ms=timeout_ms
+                    )
+                    api_time = time.time() - api_start
+                    logger.info(f"Text model API call completed in {api_time:.2f}s")
+                    # Convert the response to a dictionary
+                    # (serialize the parsed object to JSON text, then load it back as plain Python types;
+                    # presumably `parsed` is a StructuredOCRModel instance - TODO confirm)
+                    result = json.loads(chat_response.choices[0].message.parsed.json())
+                    # Ensure languages is a list of strings, not Language enum objects
+                    if 'languages' in result:
+                        result['languages'] = [str(lang) for lang in result.get('languages', [])]
+                    # Add processing metadata
+                    result['processing_method'] = 'text_model'
+                    result['document_type'] = doc_type
+                    result['model_used'] = TEXT_MODEL
+                    result['processing_time'] = time.time() - start_time
+                    # Add raw text for reference if not already present
+                    if 'ocr_contents' in result and 'raw_text' not in result['ocr_contents']:
+                        # Add truncated raw text if very large
+                        if len(ocr_markdown) > 50000:
+                            result['ocr_contents']['raw_text'] = ocr_markdown[:50000] + "\n...[content truncated]..."
+                        else:
+                            result['ocr_contents']['raw_text'] = ocr_markdown
+                    return result
+                except Exception as api_error:
+                    # Errors are classified by substring matching on the lower-cased message
+                    error_msg = str(api_error).lower()
+                    logger.warning(f"API error on attempt {retry+1}/{max_retries}: {str(api_error)}")
+                    # Check if retry would help
+                    if retry < max_retries - 1:
+                        # Rate limit errors - special handling with longer wait
+                        if any(term in error_msg for term in ["rate limit", "429", "too many requests", "requests rate limit exceeded"]):
+                            # Check specifically for token exhaustion vs temporary rate limit
+                            if any(term in error_msg for term in ["quota", "credit", "subscription"]):
+                                logger.error("API quota or credit limit reached. No retry will help.")
+                                # This ValueError is caught by the outer except and becomes a fallback result
+                                raise ValueError(f"Mistral API quota or credit limit reached. Please check your subscription: {error_msg}")
+                            # Longer backoff for rate limit errors
+                            # With retry_delay = 1 the wait before the single retry is 6 seconds
+                            wait_time = retry_delay * (2 ** retry) * 6.0  # 6x longer wait for rate limits
+                            logger.info(f"Rate limit exceeded. Waiting {wait_time:.1f}s before retry...")
+                            time.sleep(wait_time)
+                        # Other transient errors
+                        elif any(term in error_msg for term in ["timeout", "connection", "500", "503", "504"]):
+                            # Wait before retrying
+                            wait_time = retry_delay * (2 ** retry)
+                            logger.info(f"Transient error, retrying in {wait_time}s")
+                            time.sleep(wait_time)
+                        else:
+                            # Non-retryable error
+                            # Re-raised to the outer except, which builds the fallback result
+                            raise
+                    else:
+                        # Last retry failed
+                        raise
+            # This shouldn't be reached due to raise in the loop, but just in case
+            raise Exception("All retries failed for text model")
+        except Exception as e:
+            logger.error(f"Text model failed: {str(e)}. Creating basic result.")
+            # Create a basic result with available OCR text
+            try:
+                # Create a more informative fallback result
+                result = {
+                    "file_name": filename,
+                    "topics": ["Document"],
+                    "languages": ["English"],
+                    "ocr_contents": {
+                        "raw_text": ocr_markdown[:50000] if ocr_markdown else "No text could be extracted",
+                        "error": f"AI processing failed: {str(e)}"
+                    },
+                    "processing_method": "fallback",
+                    "processing_error": str(e),
+                    "processing_time": time.time() - start_time
+                }
+                # Try to extract some basic metadata even without AI
+                if ocr_markdown:
+                    # Simple content analysis
+                    # Only the first 5000 characters are inspected, case-insensitively
+                    text_sample = ocr_markdown[:5000].lower()
+                    # Guess a topic from keyword matches; the first matching rule wins
+                    if "dear" in text_sample and any(word in text_sample for word in ["sincerely", "regards", "truly"]):
+                        result["topics"].append("Letter")
+                    elif any(word in text_sample for word in ["recipe", "ingredients", "instructions", "cook", "bake"]):
+                        result["topics"].append("Recipe")
+                    elif any(word in text_sample for word in ["article", "report", "study", "analysis"]):
+                        result["topics"].append("Article")
+            except Exception as inner_e:
+                # Last resort: building the fallback itself failed, so return a minimal dict
+                logger.error(f"Error creating basic result: {str(inner_e)}")
+                result = {
+                    "file_name": str(filename) if filename else "unknown",
+                    "topics": ["Document"],
+                    "languages": ["English"],
+                    "ocr_contents": {
+                        "error": "Processing failed completely",
+                        "partial_text": ocr_markdown[:1000] if ocr_markdown else "Document could not be processed."
+                    }
+                }
+        # Only the fallback paths reach this return; the success path returns inside the loop
+        return result
+# For testing directly
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) < 2:
+        print("Usage: python structured_ocr.py <file_path>")
+        sys.exit(1)
+    file_path = sys.argv[1]
+    processor = StructuredOCR()
+    result = processor.process_file(file_path)
+    print(json.dumps(result, indent=2))

ui/__pycache__/layout.cpython-312.pyc ADDED Viewed

Binary file (7.71 kB). View file

ui/__pycache__/layout.cpython-313.pyc ADDED Viewed

Binary file (7.62 kB). View file

ui/custom.css ADDED Viewed

	@@ -0,0 +1,67 @@

+/* Minimal essential styling */
+/* Processing status container */
+/* Status line marked by a solid 3px left border */
+.processing-status-container {
+    margin: 10px 0;
+    padding: 8px 12px;
+    border-left: 3px solid #5c6bc0;
+    font-size: 0.9rem;
+}
+/* Result card styling */
+/* Wrapper around the result cards; only adds space below */
+.previous-results-container {
+    margin-bottom: 20px;
+}
+/* One bordered card with rounded corners */
+.result-card {
+    border: 1px solid #e0e0e0;
+    border-radius: 4px;
+    padding: 15px;
+    margin-bottom: 15px;
+}
+/* Card header: children pushed to opposite ends of a flex row, with a rule underneath */
+.result-header {
+    display: flex;
+    justify-content: space-between;
+    margin-bottom: 10px;
+    padding-bottom: 5px;
+    border-bottom: 1px solid #e0e0e0;
+}
+/* Bold, slightly larger text */
+.result-filename {
+    font-weight: bold;
+    font-size: 1.1rem;
+}
+/* Smaller gray text */
+.result-date {
+    font-size: 0.9rem;
+    color: #666;
+}
+/* Flex row that wraps, with an 8px gap between children */
+.result-metadata {
+    display: flex;
+    flex-wrap: wrap;
+    gap: 8px;
+    margin-bottom: 10px;
+}
+/* Rounded tag: light blue background with darker blue text */
+.result-tag {
+    background-color: #e3f2fd;
+    border-radius: 16px;
+    padding: 3px 10px;
+    font-size: 0.85rem;
+    color: #1565c0;
+}
+/* Bordered box with the same border and radius as .result-card, but more padding */
+.selected-result-container {
+    border: 1px solid #e0e0e0;
+    border-radius: 4px;
+    padding: 20px;
+    margin: 15px 0;
+}
+/* Bold heading text */
+.selected-result-title {
+    font-size: 1.3rem;
+    font-weight: bold;
+    margin-bottom: 15px;
+}

ui/layout.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import streamlit as st
+from pathlib import Path
+import os
+# Load custom CSS
+def load_css():
+    css_file = Path(__file__).parent / "custom.css"
+    if css_file.exists():
+        with open(css_file) as f:
+            st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
+    else:
+        st.warning("Custom CSS file not found. Some styles may be missing.")
+# Header component
+def header():
+    st.markdown("""
+    <div class="main-header">
+        <h1 class="title-text">Historical OCR Workshop</h1>
+    </div>
+    """, unsafe_allow_html=True)
+# Create a page wrapper similar to the React component
+def page_wrapper(content_function, current_module=1):
+    """
+    Creates a consistent page layout with navigation
+    Args:
+        content_function: Function that renders the page content
+        current_module: Current module number (1-6)
+    """
+    # Load custom CSS
+    load_css()
+    # Display header
+    header()
+    # Ensure session state for navigation
+    if 'current_module' not in st.session_state:
+        st.session_state.current_module = current_module
+    # Main content area with bottom padding for the nav
+    st.markdown('<div class="main-content">', unsafe_allow_html=True)
+    # Call the content function to render the module content
+    content_function()
+    # Add spacer for fixed nav
+    st.markdown('<div class="footer-spacer"></div>', unsafe_allow_html=True)
+    # Navigation
+    render_navigation(current_module)
+    st.markdown('</div>', unsafe_allow_html=True)
+# Navigation component
+def render_navigation(current_module):
+    # Define modules names like in React
+    modules = ['Introduction', 'Historical Context', 'Methodology', 'Case Studies', 'Interactive OCR', 'Conclusion']
+    # Navigation container
+    st.markdown(f"""
+    <div class="nav-container">
+        <div class="nav-buttons">
+            {prev_button_html(current_module, modules)}
+            {next_button_html(current_module, modules)}
+        </div>
+        <div class="nav-dots">
+            {nav_dots_html(current_module, modules)}
+        </div>
+    </div>
+    """, unsafe_allow_html=True)
+# Previous button HTML
+def prev_button_html(current_module, modules):
+    if current_module > 1:
+        prev_module = current_module - 1
+        return f"""
+        <button class="prev-button"
+                onclick="document.getElementById('nav_prev_{prev_module}').click()"
+                aria-label="Go to previous module: {modules[prev_module-1]}">
+            ← Previous
+        </button>
+        """
+    return ""
+# Next button HTML
+def next_button_html(current_module, modules):
+    if current_module < len(modules):
+        next_module = current_module + 1
+        return f"""
+        <button class="next-button"
+                onclick="document.getElementById('nav_next_{next_module}').click()"
+                aria-label="Go to next module: {modules[next_module-1]}">
+            Next →
+        </button>
+        """
+    return ""
+# Navigation dots HTML
+def nav_dots_html(current_module, modules):
+    dots_html = ""
+    for i, name in enumerate(modules, 1):
+        active_class = "active" if i == current_module else ""
+        dots_html += f"""
+        <a class="nav-dot {active_class}"
+           onclick="document.getElementById('nav_dot_{i}').click()"
+           aria-current="{i == current_module}"
+           aria-label="Go to module {i}: {name}">
+            {i}
+        </a>
+        """
+    return dots_html
+# Helper functions for container styles
+def gray_container(content, padding="1.5rem"):
+    """Renders content in a gray container with consistent styling"""
+    st.markdown(f'<div class="content-container" style="padding:{padding};">{content}</div>', unsafe_allow_html=True)
+def blue_container(content, padding="1.5rem"):
+    """Renders content in a blue container with consistent styling"""
+    st.markdown(f'<div class="blue-container" style="padding:{padding};">{content}</div>', unsafe_allow_html=True)
+def yellow_container(content, padding="1.5rem"):
+    """Renders content in a yellow container with consistent styling"""
+    st.markdown(f'<div class="yellow-container" style="padding:{padding};">{content}</div>', unsafe_allow_html=True)
+def card_grid(cards):
+    """
+    Renders a responsive grid of cards
+    Args:
+        cards: List of HTML strings for each card
+    """
+    grid_html = '<div class="card-grid">'
+    for card in cards:
+        grid_html += f'<div class="card">{card}</div>'
+    grid_html += '</div>'
+    st.markdown(grid_html, unsafe_allow_html=True)
+def module_card(number, title, description):
+    """Creates a styled module card"""
+    return f"""
+    <div class="module-card">
+        <div class="module-number">Module {number}</div>
+        <div class="module-title">{title}</div>
+        <p>{description}</p>
+    </div>
+    """
+def key_concept(content):
+    """Renders a key concept box"""
+    st.markdown(f'<div class="key-concept">{content}</div>', unsafe_allow_html=True)
+def research_question(content):
+    """Renders a research question box"""
+    st.markdown(f'<div class="research-question">{content}</div>', unsafe_allow_html=True)
+def quote(content, author=""):
+    """Renders a quote with optional author"""
+    quote_html = f'<div class="quote-container">{content}'
+    if author:
+        quote_html += f'<br/><br/><span style="font-size:0.9rem; text-align:right; display:block;">— {author}</span>'
+    quote_html += '</div>'
+    st.markdown(quote_html, unsafe_allow_html=True)
+def tool_container(content):
+    """Renders content in a tool container"""
+    st.markdown(f'<div class="tool-container">{content}</div>', unsafe_allow_html=True)
+def upload_container(content):
+    """Renders content in an upload container"""
+    st.markdown(f'<div class="upload-container">{content}</div>', unsafe_allow_html=True)