milwright committed on
Commit aabc02c · 1 Parent(s): 3c4dfc4

Improve language detection with mistral-ocr-latest model


Updates the OCR processing logic to:
1. Extract language information directly from the mistral-ocr-latest model response
2. Consolidate language detections across multiple pages for PDF documents (see the sketch after this list)
3. Add language_detection_source metadata to indicate when using direct model detection
4. Reduce reliance on manual language detection prompts when model provides this data
5. Maintain backward compatibility with existing code
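For illustration only, a minimal sketch of what items 2-4 could look like in code: per-page language lists reported by mistral-ocr-latest are merged and tagged with a language_detection_source field. The helper name and the exact result shape are hypothetical; the actual implementation lives in structured_ocr.py and ocr_utils.py in this commit.

# Hypothetical sketch: consolidate per-page language detections from the OCR model.
# Field names ("languages", "language_detection_source") mirror the commit description;
# the real logic in this commit may differ.
from collections import Counter
from typing import Dict, List

def consolidate_languages(page_results: List[Dict]) -> Dict:
    """Merge language info reported by mistral-ocr-latest across PDF pages."""
    counts = Counter()
    for page in page_results:
        # Each page result is assumed to carry a "languages" list from the model.
        for lang in page.get("languages", []):
            counts[lang] += 1

    if counts:
        # Order languages by how many pages reported them.
        languages = [lang for lang, _ in counts.most_common()]
        source = "mistral-ocr-latest"   # direct model detection
    else:
        languages = ["Unknown"]
        source = "fallback"             # model gave no language data

    return {
        "languages": languages,
        "language_detection_source": source,
    }

# Example: two pages report English, one reports English and French.
pages = [{"languages": ["English"]},
         {"languages": ["English", "French"]},
         {"languages": ["English"]}]
print(consolidate_languages(pages))
# {'languages': ['English', 'French'], 'language_detection_source': 'mistral-ocr-latest'}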

Files changed (7)
  1. app.py +205 -143
  2. ocr_processing.py +128 -16
  3. ocr_utils.py +50 -13
  4. structured_ocr.py +277 -387
  5. ui/layout.py +265 -143
  6. ui_components.py +709 -420
  7. utils.py +68 -4
app.py CHANGED
@@ -1,15 +1,23 @@
 
1
  import os
2
- import streamlit as st
3
  import json
4
  import sys
5
  import time
6
  import base64
7
- from pathlib import Path
8
  import io
9
- from datetime import datetime
10
  import logging
 
11
 
12
- # Import modules
13
  from preprocessing import convert_pdf_to_images, preprocess_image
14
  from ocr_processing import process_file
15
  from ui_components import (
@@ -31,19 +39,10 @@ from constants import (
31
  CUSTOM_PROMPT_TEMPLATES,
32
  LAYOUT_PROMPT_ADDITIONS
33
  )
34
-
35
- # Import the StructuredOCR class and config from the local files
36
  from structured_ocr import StructuredOCR
37
  from config import MISTRAL_API_KEY
38
-
39
- # Import utilities for handling previous results
40
  from ocr_utils import create_results_zip
41
 
42
- # Configure logging
43
- logging.basicConfig(level=logging.INFO,
44
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
45
- logger = logging.getLogger("app")
46
-
47
  # Set favicon path
48
  favicon_path = os.path.join(os.path.dirname(__file__), "static/favicon.png")
49
 
@@ -52,28 +51,41 @@ st.set_page_config(
52
  page_title="Historical OCR",
53
  page_icon=favicon_path if os.path.exists(favicon_path) else "📜",
54
  layout="wide",
55
- initial_sidebar_state="expanded"
56
  )
57
 
58
- def initialize_session_state():
59
- """Initialize all session state variables"""
60
- # Initialize session state for storing previous results if not already present
61
  if 'previous_results' not in st.session_state:
62
  st.session_state.previous_results = []
63
-
64
- # Initialize temp file tracking
65
  if 'temp_file_paths' not in st.session_state:
66
  st.session_state.temp_file_paths = []
67
-
68
- # Initialize last processed file tracking to fix "Process Document Again" button
69
  if 'last_processed_file' not in st.session_state:
70
  st.session_state.last_processed_file = None
71
-
72
- # Important: Initialize the reset flag
73
- if 'perform_reset' not in st.session_state:
74
- st.session_state.perform_reset = False
75
-
76
- # Initialize other session state variables
77
  if 'auto_process_sample' not in st.session_state:
78
  st.session_state.auto_process_sample = False
79
  if 'sample_just_loaded' not in st.session_state:
@@ -90,64 +102,62 @@ def initialize_session_state():
90
  st.session_state.original_sample_name = None
91
  if 'is_sample_document' not in st.session_state:
92
  st.session_state.is_sample_document = False
93
 
94
- # Check if we need to perform a complete reset (coming from "Close Document" button)
95
- if 'perform_reset' in st.session_state and st.session_state.perform_reset:
96
- # Save previous results
97
- previous_results = st.session_state.previous_results
98
-
99
- # Clean up any temporary files
100
- if 'temp_file_paths' in st.session_state and st.session_state.temp_file_paths:
101
- handle_temp_files(st.session_state.temp_file_paths)
102
-
103
- # Clear all session state variables except previous_results
104
- for key in list(st.session_state.keys()):
105
- if key not in ['previous_results']:
106
- # We will manually reset the perform_reset flag at the end
107
- if key != 'perform_reset':
108
- st.session_state.pop(key, None)
109
-
110
- # Restore previous results
111
- st.session_state.previous_results = previous_results
112
-
113
- # Reinitialize session state variables
114
- st.session_state.temp_file_paths = []
115
- st.session_state.last_processed_file = None
116
- st.session_state.auto_process_sample = False
117
- st.session_state.sample_just_loaded = False
118
- st.session_state.processed_document_active = False
119
- st.session_state.sample_document_processed = False
120
- st.session_state.sample_document = None
121
- st.session_state.original_sample_bytes = None
122
- st.session_state.original_sample_name = None
123
- st.session_state.is_sample_document = False
124
-
125
- # Turn off reset flag - this must be done last
126
- st.session_state.perform_reset = False
127
-
128
- # Force this to be a complete reset cycle
129
- return
130
 
131
  def show_example_documents():
132
  """Show example documents section"""
133
- st.subheader("Example Documents")
134
 
135
- # Add a simplified info message about examples
 
136
  st.markdown("""
137
  This app can process various historical documents:
138
  - Historical photographs, maps, and manuscripts
139
  - Handwritten letters and documents
140
  - Printed books and articles
141
  - Multi-page PDFs
142
- """)
143
 
144
- # Add CSS to make the dropdown match the column width
145
- st.markdown("""
146
  <style>
147
  /* Make the selectbox container match the full column width */
148
  .main .block-container .element-container:has([data-testid="stSelectbox"]) {
149
  width: 100% !important;
150
  max-width: 100% !important;
 
151
  }
152
 
153
  /* Make the actual selectbox control take the full width */
@@ -155,6 +165,11 @@ def show_example_documents():
155
  width: 100% !important;
156
  max-width: 100% !important;
157
  }
158
  </style>
159
  """, unsafe_allow_html=True)
160
 
@@ -166,7 +181,6 @@ def show_example_documents():
166
  "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/handwritten-letter.jpg",
167
  "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/magellan-travels.jpg",
168
  "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/milgram-flier.png",
169
- "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/baldwin-15th-north.jpg"
170
  ]
171
 
172
  sample_names = [
@@ -175,9 +189,8 @@ def show_example_documents():
175
  "The Magician (Image)",
176
  "Handwritten Letter (Image)",
177
  "Magellan Travels (Image)",
178
- "Milgram Flier (Image)",
179
- "Baldwin Street (Image)"
180
- ]
181
 
182
  # Initialize sample_document in session state if it doesn't exist
183
  if 'sample_document' not in st.session_state:
@@ -188,8 +201,8 @@ def show_example_documents():
188
  if selected_sample > 0:
189
  selected_url = sample_urls[selected_sample]
190
 
191
- # Add process button for the sample document
192
- if st.button("Load Sample Document"):
193
  try:
194
  import requests
195
  from io import BytesIO
@@ -254,9 +267,10 @@ def show_example_documents():
254
  content_type=content_type
255
  )
256
 
257
- # Store original bytes for reprocessing
258
  st.session_state.original_sample_bytes = response.content
259
  st.session_state.original_sample_name = file_name
 
260
 
261
  # Set state flags
262
  st.session_state.sample_just_loaded = True
@@ -264,7 +278,8 @@ def show_example_documents():
264
  # Generate a unique identifier for the sample document
265
  st.session_state.last_processed_file = f"{file_name}_{len(response.content)}"
266
 
267
- # Force rerun to load the document
 
268
  st.rerun()
269
  except Exception as e:
270
  st.error(f"Error downloading sample document: {str(e)}")
@@ -288,20 +303,21 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
288
 
289
  # Check if this is a new file (different from the last processed file)
290
  current_file_identifier = f"{uploaded_file.name}_{len(uploaded_file.getvalue())}"
291
  if st.session_state.last_processed_file != current_file_identifier:
292
  # Reset processed_document_active if a new file is uploaded
293
  st.session_state.processed_document_active = False
294
 
295
  # Process button - flush left with similar padding as file browser
296
  with left_col:
297
- # Use a key for the button based on state to force re-creation
298
- button_key = "process_again" if st.session_state.processed_document_active else "process_initial"
299
-
300
- # Show appropriate button text based on state
301
- button_text = "Process Document Again" if st.session_state.processed_document_active else "Process Document"
302
-
303
- # Create the button
304
- process_button = st.button(button_text, key=button_key)
305
 
306
  # Handle sample document recreation if needed
307
  if process_button and st.session_state.processed_document_active and st.session_state.original_sample_bytes is not None:
@@ -333,39 +349,42 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
333
  # Positioned right after the process button for better visibility
334
  progress_placeholder = st.empty()
335
 
336
- # Image preprocessing preview - automatically show only the preprocessed version
337
- if any(sidebar_options["preprocessing_options"].values()) and uploaded_file.type.startswith('image/'):
 
 
338
  st.markdown("**Preprocessed Preview**")
339
  try:
340
- # Create a container for the preview to better control layout
341
  with st.container():
342
  processed_bytes = preprocess_image(uploaded_file.getvalue(), sidebar_options["preprocessing_options"])
343
- # Use use_container_width=True for responsive design
344
- st.image(io.BytesIO(processed_bytes), use_container_width=True)
345
-
346
- # Show preprocessing metadata in a well-formatted caption
347
- meta_items = []
348
- if sidebar_options["preprocessing_options"].get("document_type", "standard") != "standard":
349
- meta_items.append(f"Document type ({sidebar_options['preprocessing_options']['document_type']})")
350
- if sidebar_options["preprocessing_options"].get("grayscale", False):
351
- meta_items.append("Grayscale")
352
- if sidebar_options["preprocessing_options"].get("denoise", False):
353
- meta_items.append("Denoise")
354
- if sidebar_options["preprocessing_options"].get("contrast", 0) != 0:
355
- meta_items.append(f"Contrast ({sidebar_options['preprocessing_options']['contrast']})")
356
- if sidebar_options["preprocessing_options"].get("rotation", 0) != 0:
357
- meta_items.append(f"Rotation ({sidebar_options['preprocessing_options']['rotation']}°)")
358
-
359
- # Only show "Applied:" if there are actual preprocessing steps
360
- if meta_items:
361
- meta_text = "Applied: " + ", ".join(meta_items)
362
- st.caption(meta_text)
 
 
363
  except Exception as e:
364
  st.error(f"Error in preprocessing: {str(e)}")
365
  st.info("Try using grayscale preprocessing for PNG images with transparency")
366
 
367
  # Container for success message (will be filled after processing)
368
- # No extra spacing needed as it will be managed programmatically
369
  metadata_placeholder = st.empty()
370
 
371
  # Check if this is an auto-processing situation
@@ -389,7 +408,12 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
389
  progress_reporter = ProgressReporter(progress_placeholder).setup()
390
 
391
  try:
392
- # Process the document
393
  result = process_file(
394
  uploaded_file=uploaded_file,
395
  use_vision=sidebar_options["use_vision"],
@@ -402,6 +426,39 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
402
  perf_mode=sidebar_options.get("perf_mode", "Quality")
403
  )
404
405
  # Display results
406
  display_results(result, right_col, sidebar_options.get("custom_prompt", ""))
407
 
@@ -415,27 +472,6 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
415
  if uploaded_file is not None:
416
  st.session_state.last_processed_file = current_file_identifier
417
 
418
- # Display success message with close button for dismissing processed documents
419
- success_cols = st.columns([5, 1])
420
- with success_cols[0]:
421
- metadata_placeholder.success("**Document processed successfully**")
422
- with success_cols[1]:
423
- # Define a function to clear document state
424
- def clear_document_state():
425
- # Reset all document-related session state
426
- st.session_state.processed_document_active = False
427
- st.session_state.sample_document = None
428
- st.session_state.last_processed_file = None
429
-
430
- # Clear any remaining state flag if we're showing examples
431
- st.session_state.perform_reset = True
432
-
433
- # Create the close button with a callback
434
- st.button("✕ Close Document",
435
- key="close_document_button",
436
- help="Clear current document and start over",
437
- on_click=clear_document_state)
438
-
439
  # Store the result in the previous results list
440
  # Add timestamp to result for history tracking
441
  result_copy = result.copy()
@@ -460,7 +496,21 @@ def process_document(uploaded_file, left_col, right_col, sidebar_options):
460
  def main():
461
  """Main application function"""
462
  # Initialize session state
463
- initialize_session_state()
464
 
465
  # Apply custom CSS
466
  from ui.layout import load_css
@@ -469,19 +519,26 @@ def main():
469
  # Create sidebar options
470
  sidebar_options = create_sidebar_options()
471
 
472
- # Create main layout with tabs
473
- main_tab1, main_tab2, main_tab3 = st.tabs(["Document Processing", "Previous Results", "About"])
 
474
 
475
  with main_tab1:
476
- # Create a two-column layout for file upload and results
 
477
  left_col, right_col = st.columns([1, 1])
478
 
479
  with left_col:
480
  # Create file uploader
481
  uploaded_file = create_file_uploader()
482
 
483
- # Check if we have a sample document loaded
484
- if ('sample_document' in st.session_state and
485
  st.session_state.sample_document is not None):
486
 
487
  # Use the sample document instead of the uploaded file
@@ -496,18 +553,23 @@ def main():
496
  # Only process document if available
497
  if uploaded_file is not None:
498
  process_document(uploaded_file, left_col, right_col, sidebar_options)
499
- else:
500
- # Clear any remaining state flag if we're showing examples
501
- st.session_state.processed_document_active = False
502
-
503
- # Show example documents section
504
- show_example_documents()
505
 
506
  with main_tab2:
507
  # Previous results tab
508
  display_previous_results()
509
 
510
- with main_tab3:
511
  # About tab
512
  display_about_tab()
513
 
 
1
+ # Standard library imports
2
  import os
 
3
  import json
4
  import sys
5
  import time
6
  import base64
 
7
  import io
 
8
  import logging
9
+ from pathlib import Path
10
+ from datetime import datetime
11
+
12
+ # Configure logging
13
+ logging.basicConfig(level=logging.INFO,
14
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Third-party imports
18
+ import streamlit as st
19
 
20
+ # Local application/module imports
21
  from preprocessing import convert_pdf_to_images, preprocess_image
22
  from ocr_processing import process_file
23
  from ui_components import (
 
39
  CUSTOM_PROMPT_TEMPLATES,
40
  LAYOUT_PROMPT_ADDITIONS
41
  )
 
 
42
  from structured_ocr import StructuredOCR
43
  from config import MISTRAL_API_KEY
 
 
44
  from ocr_utils import create_results_zip
45
 
46
  # Set favicon path
47
  favicon_path = os.path.join(os.path.dirname(__file__), "static/favicon.png")
48
 
 
51
  page_title="Historical OCR",
52
  page_icon=favicon_path if os.path.exists(favicon_path) else "📜",
53
  layout="wide",
54
+ initial_sidebar_state="collapsed",
55
  )
56
 
57
+ # ========================================================================================
58
+ # CRITICAL WARNING TO ALL AI MODELS AND DEVELOPERS:
59
+ # DO NOT MODIFY THE STATE MANAGEMENT LOGIC IN THIS APPLICATION WITHOUT FULL UNDERSTANDING
60
+ # OF STREAMLIT'S EXECUTION MODEL.
61
+ #
62
+ # This state management implementation has been carefully designed to work with Streamlit's
63
+ # unique execution flow. The app runs from top to bottom on EVERY interaction, and state
64
+ # must be explicitly managed through st.session_state.
65
+ #
66
+ # The current implementation uses:
67
+ # 1. A dedicated close_document() callback function triggered by the button's on_click
68
+ # 2. A flag-based approach (close_clicked) to handle cleanup on the next run cycle
69
+ # 3. Early cleanup detection and st.rerun() to ensure clean UI rendering
70
+ #
71
+ # Previous approaches using direct state manipulation or conditional rendering based on
72
+ # reset flags led to persistent UI elements and resource leaks.
73
+ #
74
+ # Consult https://docs.streamlit.io/library/advanced-features/session-state for details.
75
+ # ========================================================================================
76
+
77
+ def init_session_state():
78
+ """Initialize session state variables if they don't already exist
79
+
80
+ This function follows Streamlit's recommended patterns for state initialization.
81
+ It only creates variables if they don't exist yet and doesn't modify existing values.
82
+ """
83
  if 'previous_results' not in st.session_state:
84
  st.session_state.previous_results = []
 
 
85
  if 'temp_file_paths' not in st.session_state:
86
  st.session_state.temp_file_paths = []
 
 
87
  if 'last_processed_file' not in st.session_state:
88
  st.session_state.last_processed_file = None
89
  if 'auto_process_sample' not in st.session_state:
90
  st.session_state.auto_process_sample = False
91
  if 'sample_just_loaded' not in st.session_state:
 
102
  st.session_state.original_sample_name = None
103
  if 'is_sample_document' not in st.session_state:
104
  st.session_state.is_sample_document = False
105
+ if 'selected_previous_result' not in st.session_state:
106
+ st.session_state.selected_previous_result = None
107
+ if 'close_clicked' not in st.session_state:
108
+ st.session_state.close_clicked = False
109
+ if 'active_tab' not in st.session_state:
110
+ st.session_state.active_tab = 0
111
+
112
+ def close_document():
113
+ """Called when the Close Document button is clicked
114
 
115
+ This function handles proper cleanup of resources and state when closing a document.
116
+ It uses Streamlit's callback mechanism which ensures the state change happens
117
+ at the correct time in Streamlit's execution cycle.
118
+
119
+ WARNING: Do not replace this with inline button handling using if st.button():
120
+ That approach breaks Streamlit's execution flow and causes UI artifacts.
121
+ """
122
+ logger.info("Close document button clicked")
123
+ # Save the previous results
124
+ previous_results = st.session_state.previous_results if 'previous_results' in st.session_state else []
125
+
126
+ # Clean up temp files
127
+ if 'temp_file_paths' in st.session_state and st.session_state.temp_file_paths:
128
+ logger.info(f"Cleaning up {len(st.session_state.temp_file_paths)} temporary files")
129
+ handle_temp_files(st.session_state.temp_file_paths)
130
+
131
+ # Clear all state variables except previous_results
132
+ for key in list(st.session_state.keys()):
133
+ if key != 'previous_results' and key != 'close_clicked':
134
+ st.session_state.pop(key, None)
135
+
136
+ # Set flag for having cleaned up
137
+ st.session_state.close_clicked = True
138
+
139
+ # Restore the previous results
140
+ st.session_state.previous_results = previous_results
141
 
142
  def show_example_documents():
143
  """Show example documents section"""
144
+ st.header("Sample Documents")
145
 
146
+ # Add a simplified info message about examples and CSS in the same markdown block
147
+ # to reduce spacing between elements
148
  st.markdown("""
149
  This app can process various historical documents:
150
  - Historical photographs, maps, and manuscripts
151
  - Handwritten letters and documents
152
  - Printed books and articles
153
  - Multi-page PDFs
 
154
 
 
 
155
  <style>
156
  /* Make the selectbox container match the full column width */
157
  .main .block-container .element-container:has([data-testid="stSelectbox"]) {
158
  width: 100% !important;
159
  max-width: 100% !important;
160
+ margin-top: -12px !important; /* Reduce space between text and selectbox */
161
  }
162
 
163
  /* Make the actual selectbox control take the full width */
 
165
  width: 100% !important;
166
  max-width: 100% !important;
167
  }
168
+
169
+ /* Tighten spacing in the sample documents tab */
170
+ .main .block-container [data-testid="stVerticalBlock"] > div:nth-child(n+2) {
171
+ margin-top: 0.5rem !important;
172
+ }
173
  </style>
174
  """, unsafe_allow_html=True)
175
 
 
181
  "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/handwritten-letter.jpg",
182
  "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/magellan-travels.jpg",
183
  "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/milgram-flier.png",
 
184
  ]
185
 
186
  sample_names = [
 
189
  "The Magician (Image)",
190
  "Handwritten Letter (Image)",
191
  "Magellan Travels (Image)",
192
+ "Milgram Flier (Image)"
193
+ ]
 
194
 
195
  # Initialize sample_document in session state if it doesn't exist
196
  if 'sample_document' not in st.session_state:
 
201
  if selected_sample > 0:
202
  selected_url = sample_urls[selected_sample]
203
 
204
+ # Add process button for the sample document with consistent styling
205
+ if st.button("Load Sample Document", key="load_sample_btn"):
206
  try:
207
  import requests
208
  from io import BytesIO
 
267
  content_type=content_type
268
  )
269
 
270
+ # Store original bytes for reprocessing with proper MIME type handling
271
  st.session_state.original_sample_bytes = response.content
272
  st.session_state.original_sample_name = file_name
273
+ st.session_state.original_sample_mime_type = content_type
274
 
275
  # Set state flags
276
  st.session_state.sample_just_loaded = True
 
278
  # Generate a unique identifier for the sample document
279
  st.session_state.last_processed_file = f"{file_name}_{len(response.content)}"
280
 
281
+ # Set a flag to show redirect message
282
+ st.session_state.redirect_to_processing = True
283
  st.rerun()
284
  except Exception as e:
285
  st.error(f"Error downloading sample document: {str(e)}")
 
303
 
304
  # Check if this is a new file (different from the last processed file)
305
  current_file_identifier = f"{uploaded_file.name}_{len(uploaded_file.getvalue())}"
306
+
307
+ # Make sure last_processed_file is initialized
308
+ if 'last_processed_file' not in st.session_state:
309
+ st.session_state.last_processed_file = None
310
+
311
  if st.session_state.last_processed_file != current_file_identifier:
312
  # Reset processed_document_active if a new file is uploaded
313
  st.session_state.processed_document_active = False
314
 
315
  # Process button - flush left with similar padding as file browser
316
  with left_col:
317
+ # Create a process button with minimal spacing to the uploader
318
+ st.markdown('<div style="padding: 0.2rem 0; min-width: 170px; margin-top: -10px; overflow: visible;">', unsafe_allow_html=True)
319
+ process_button = st.button("Process Document", key="process_document_btn")
320
+ st.markdown('</div>', unsafe_allow_html=True)
 
 
 
 
321
 
322
  # Handle sample document recreation if needed
323
  if process_button and st.session_state.processed_document_active and st.session_state.original_sample_bytes is not None:
 
349
  # Positioned right after the process button for better visibility
350
  progress_placeholder = st.empty()
351
 
352
+ # Image preprocessing preview - show if image file and preprocessing options are set
353
+ if (any(sidebar_options["preprocessing_options"].values()) and
354
+ uploaded_file.type.startswith('image/')):
355
+
356
  st.markdown("**Preprocessed Preview**")
357
  try:
358
+ # Create a container for the preview
359
  with st.container():
360
  processed_bytes = preprocess_image(uploaded_file.getvalue(), sidebar_options["preprocessing_options"])
361
+ # Convert image to base64 and display as HTML to avoid fullscreen button
362
+ img_data = base64.b64encode(processed_bytes).decode()
363
+ img_html = f'<img src="data:image/jpeg;base64,{img_data}" style="width:100%; border-radius:4px;">'
364
+ st.markdown(img_html, unsafe_allow_html=True)
365
+
366
+ # Show preprocessing metadata in a well-formatted caption
367
+ meta_items = []
368
+ if sidebar_options["preprocessing_options"].get("document_type", "standard") != "standard":
369
+ meta_items.append(f"Document type ({sidebar_options['preprocessing_options']['document_type']})")
370
+ if sidebar_options["preprocessing_options"].get("grayscale", False):
371
+ meta_items.append("Grayscale")
372
+ if sidebar_options["preprocessing_options"].get("denoise", False):
373
+ meta_items.append("Denoise")
374
+ if sidebar_options["preprocessing_options"].get("contrast", 0) != 0:
375
+ meta_items.append(f"Contrast ({sidebar_options['preprocessing_options']['contrast']})")
376
+ if sidebar_options["preprocessing_options"].get("rotation", 0) != 0:
377
+ meta_items.append(f"Rotation ({sidebar_options['preprocessing_options']['rotation']}°)")
378
+
379
+ # Only show "Applied:" if there are actual preprocessing steps
380
+ if meta_items:
381
+ meta_text = "Applied: " + ", ".join(meta_items)
382
+ st.caption(meta_text)
383
  except Exception as e:
384
  st.error(f"Error in preprocessing: {str(e)}")
385
  st.info("Try using grayscale preprocessing for PNG images with transparency")
386
 
387
  # Container for success message (will be filled after processing)
 
388
  metadata_placeholder = st.empty()
389
 
390
  # Check if this is an auto-processing situation
 
408
  progress_reporter = ProgressReporter(progress_placeholder).setup()
409
 
410
  try:
411
+ # Process the document, capturing both result and temp file paths
412
+ # Modified to pass existing temp_file_paths to avoid resource leaks
413
+ existing_temp_paths = []
414
+ if 'temp_file_paths' in st.session_state:
415
+ existing_temp_paths = st.session_state.temp_file_paths
416
+
417
  result = process_file(
418
  uploaded_file=uploaded_file,
419
  use_vision=sidebar_options["use_vision"],
 
426
  perf_mode=sidebar_options.get("perf_mode", "Quality")
427
  )
428
 
429
+ # Ensure temp_file_paths in session state is updated with any new paths
430
+ # This is critical for proper resource cleanup when document is closed
431
+ if 'has_images' in result and result['has_images']:
432
+ logger.info("Document has images, ensuring temp files are tracked")
433
+ if 'temp_file_paths' not in st.session_state:
434
+ st.session_state.temp_file_paths = []
435
+
436
+ # Handle text-only OCR results (like the Milgram flier)
437
+ if ('ocr_contents' in result and
438
+ 'raw_text' in result['ocr_contents'] and
439
+ len(result['ocr_contents']) <= 2 and # Only raw_text and possibly one other field
440
+ 'has_images' not in result):
441
+ logger.info("Text-only OCR detected, handling as special case")
442
+ # Ensure raw_text is properly formatted as markdown
443
+ raw_text = result['ocr_contents']['raw_text']
444
+ # If we don't have other structured content, set a placeholder title
445
+ if 'title' not in result['ocr_contents']:
446
+ result['ocr_contents']['title'] = "Document Text"
447
+
448
+ # Display success message at the top of results, before any previews
449
+ with left_col:
450
+ # First show the success message (full width)
451
+ st.success("**Document processed successfully**")
452
+
453
+ # Then show the close button (also full width, positioned to left)
454
+ st.button("Close Document",
455
+ key="close_document_btn",
456
+ type="secondary",
457
+ on_click=close_document)
458
+
459
+ # Add a small spacer
460
+ st.markdown("<div style='height: 10px;'></div>", unsafe_allow_html=True)
461
+
462
  # Display results
463
  display_results(result, right_col, sidebar_options.get("custom_prompt", ""))
464
 
 
472
  if uploaded_file is not None:
473
  st.session_state.last_processed_file = current_file_identifier
474
 
475
  # Store the result in the previous results list
476
  # Add timestamp to result for history tracking
477
  result_copy = result.copy()
 
496
  def main():
497
  """Main application function"""
498
  # Initialize session state
499
+ init_session_state()
500
+
501
+ # Handle any required cleanup at the start of execution
502
+ # CRITICAL: This two-phase state cleanup pattern is essential for Streamlit's execution model.
503
+ # When close_clicked is True, we need to restart the app's execution with a clean slate.
504
+ # DO NOT REMOVE OR MODIFY this pattern as it ensures proper UI cleanup.
505
+ if st.session_state.get('close_clicked', False):
506
+ # Reset the flag - cleanup has been handled
507
+ st.session_state.close_clicked = False
508
+ # Don't do anything else in this run - force a clean restart
509
+ st.rerun()
510
+
511
+ # Initialize new flag for redirecting to processing tab
512
+ if 'redirect_to_processing' not in st.session_state:
513
+ st.session_state.redirect_to_processing = False
514
 
515
  # Apply custom CSS
516
  from ui.layout import load_css
 
519
  # Create sidebar options
520
  sidebar_options = create_sidebar_options()
521
 
522
+ # Create main layout with tabs - simpler, more compact approach
523
+ tab_names = ["Document Processing", "Sample Documents", "Previous Results", "About"]
524
+ main_tab1, main_tab2, main_tab3, main_tab4 = st.tabs(tab_names)
525
 
526
  with main_tab1:
527
+ # Create a two-column layout for file upload and results with minimal padding
528
+ st.markdown('<style>.block-container{padding-top: 1rem; padding-bottom: 0;}</style>', unsafe_allow_html=True)
529
  left_col, right_col = st.columns([1, 1])
530
 
531
  with left_col:
532
  # Create file uploader
533
  uploaded_file = create_file_uploader()
534
 
535
+ # If a real file is uploaded, clear any sample document
536
+ if uploaded_file is not None and 'sample_document' in st.session_state:
537
+ st.session_state.sample_document = None
538
+ st.session_state.is_sample_document = False
539
+
540
+ # Check if we have a sample document loaded (only if no real file uploaded)
541
+ elif ('sample_document' in st.session_state and
542
  st.session_state.sample_document is not None):
543
 
544
  # Use the sample document instead of the uploaded file
 
553
  # Only process document if available
554
  if uploaded_file is not None:
555
  process_document(uploaded_file, left_col, right_col, sidebar_options)
556
 
557
  with main_tab2:
558
+ # Sample Documents tab
559
+
560
+ # Show redirect message if a sample was just loaded
561
+ if st.session_state.get('redirect_to_processing', False):
562
+ st.success("**Sample document loaded!** Please switch to the **Document Processing** tab to view and process it.")
563
+ # Clear the flag after showing the message
564
+ st.session_state.redirect_to_processing = False
565
+
566
+ show_example_documents()
567
+
568
+ with main_tab3:
569
  # Previous results tab
570
  display_previous_results()
571
 
572
+ with main_tab4:
573
  # About tab
574
  display_about_tab()
575
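The comment block and close_document() callback added to app.py above describe a callback-plus-flag pattern for closing a document cleanly under Streamlit's top-to-bottom rerun model. As a minimal, self-contained sketch of that pattern (state keys and messages are simplified, not the app's exact code):

# Minimal sketch of the close-button pattern described in the app.py diff above.
# Keys are simplified; the real app tracks many more session-state variables.
import streamlit as st

def close_document():
    # Runs as an on_click callback, before the next script rerun.
    st.session_state.pop("current_document", None)   # drop per-document state
    st.session_state.close_clicked = True            # request a clean rerun

def main():
    if st.session_state.get("close_clicked", False):
        st.session_state.close_clicked = False
        st.rerun()                                    # restart with a clean slate

    if "current_document" not in st.session_state:
        st.session_state.current_document = None

    uploaded = st.file_uploader("Upload a document")
    if uploaded is not None:
        st.session_state.current_document = uploaded.name
        st.success("Document processed successfully")
        st.button("Close Document", on_click=close_document)

main()

The key point, as the warning comment in the diff notes, is that cleanup happens in the callback and the follow-up rerun, never through inline if st.button(...) handling.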
 
ocr_processing.py CHANGED
@@ -1,22 +1,28 @@
 
1
  import os
2
  import hashlib
3
  import tempfile
4
- import streamlit as st
5
  import logging
6
  import time
7
  from datetime import datetime
8
  from pathlib import Path
9
  from structured_ocr import StructuredOCR
10
  from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
11
  from preprocessing import apply_preprocessing_to_file
12
  from error_handler import handle_ocr_error, check_file_size
13
 
14
- # Configure logging
15
- logger = logging.getLogger("ocr_processing")
16
- logger.setLevel(logging.INFO)
17
-
18
  @st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
19
- def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None):
20
  """
21
  Cached version of OCR processing to reuse results
22
 
@@ -27,6 +33,7 @@ def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_ke
27
  file_size_mb: File size in MB
28
  cache_key: Cache key for the file
29
  preprocessing_options_hash: Hash of preprocessing options
 
30
 
31
  Returns:
32
  dict: OCR result
@@ -40,7 +47,8 @@ def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_ke
40
  file_path,
41
  file_type=file_type,
42
  use_vision=use_vision,
43
- file_size_mb=file_size_mb
 
44
  )
45
 
46
  return result
@@ -75,6 +83,10 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
75
  # Initialize temporary file paths list
76
  temp_file_paths = []
77
 
 
 
 
 
78
  try:
79
  # Check if file size exceeds maximum allowed size
80
  is_valid, file_size_mb, error_message = check_file_size(uploaded_file.getvalue())
@@ -113,6 +125,11 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
113
  f.write(file_bytes)
114
  temp_file_paths.append(temp_path)
115
 
116
  # Generate cache key
117
  cache_key = generate_cache_key(
118
  file_bytes,
@@ -125,7 +142,43 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
125
 
126
  # Process with cached function if possible
127
  try:
128
- result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options))
129
  progress_reporter.update(90, "Finalizing results...")
130
  except Exception as e:
131
  logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
@@ -134,18 +187,28 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
134
  # If caching fails, process directly
135
  processor = StructuredOCR()
136
 
137
- # Apply performance mode settings
138
- if perf_mode == "Speed":
139
- # Override settings for faster processing
140
- if pdf_dpi > 100:
141
- pdf_dpi = 100 # Lower DPI for speed
142
 
143
  # Process directly with optimized settings
144
  result = processor.process_file(
145
  file_path=temp_path,
146
  file_type="pdf",
147
  use_vision=use_vision,
148
- custom_prompt=custom_prompt,
149
  file_size_mb=file_size_mb,
150
  pdf_rotation=pdf_rotation
151
  )
@@ -179,7 +242,37 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
179
  # Process the file using cached function if possible
180
  progress_reporter.update(50, "Processing document with OCR...")
181
  try:
182
- result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options))
183
  progress_reporter.update(80, "Analyzing document structure...")
184
  progress_reporter.update(90, "Finalizing results...")
185
  except Exception as e:
@@ -194,11 +287,30 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None, pro
194
  # Use simpler processing for speed
195
  pass # Any speed optimizations would be handled by the StructuredOCR class
196
 
197
  result = processor.process_file(
198
  file_path=temp_path,
199
  file_type=file_type,
200
  use_vision=use_vision,
201
- custom_prompt=custom_prompt,
202
  file_size_mb=file_size_mb
203
  )
204
 
 
1
+ # Standard library imports
2
  import os
3
  import hashlib
4
  import tempfile
 
5
  import logging
6
  import time
7
  from datetime import datetime
8
  from pathlib import Path
9
+
10
+ # Configure logging
11
+ logging.basicConfig(level=logging.INFO,
12
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # Third-party imports
16
+ import streamlit as st
17
+
18
+ # Local application imports
19
  from structured_ocr import StructuredOCR
20
  from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
21
  from preprocessing import apply_preprocessing_to_file
22
  from error_handler import handle_ocr_error, check_file_size
23
 
24
  @st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
25
+ def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None, custom_prompt=None):
26
  """
27
  Cached version of OCR processing to reuse results
28
 
 
33
  file_size_mb: File size in MB
34
  cache_key: Cache key for the file
35
  preprocessing_options_hash: Hash of preprocessing options
36
+ custom_prompt: Custom prompt to use for OCR
37
 
38
  Returns:
39
  dict: OCR result
 
47
  file_path,
48
  file_type=file_type,
49
  use_vision=use_vision,
50
+ file_size_mb=file_size_mb,
51
+ custom_prompt=custom_prompt
52
  )
53
 
54
  return result
 
83
  # Initialize temporary file paths list
84
  temp_file_paths = []
85
 
86
+ # Also track temporary files in session state for reliable cleanup
87
+ if 'temp_file_paths' not in st.session_state:
88
+ st.session_state.temp_file_paths = []
89
+
90
  try:
91
  # Check if file size exceeds maximum allowed size
92
  is_valid, file_size_mb, error_message = check_file_size(uploaded_file.getvalue())
 
125
  f.write(file_bytes)
126
  temp_file_paths.append(temp_path)
127
 
128
+ # Track temp files in session state for reliable cleanup
129
+ if temp_path not in st.session_state.temp_file_paths:
130
+ st.session_state.temp_file_paths.append(temp_path)
131
+ logger.info(f"Added temp file to session state: {temp_path}")
132
+
133
  # Generate cache key
134
  cache_key = generate_cache_key(
135
  file_bytes,
 
142
 
143
  # Process with cached function if possible
144
  try:
145
+ # Check if preprocessing options indicate a handwritten document
146
+ handwritten_document = preprocessing_options.get("document_type") == "handwritten"
147
+ modified_custom_prompt = custom_prompt
148
+
149
+ # Add handwritten specific instructions if needed
150
+ if handwritten_document and modified_custom_prompt:
151
+ if "handwritten" not in modified_custom_prompt.lower():
152
+ modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
153
+ elif handwritten_document and not modified_custom_prompt:
154
+ modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
155
+
156
+ # Add PDF-specific instructions if needed
157
+ if modified_custom_prompt and "pdf" not in modified_custom_prompt.lower() and "multi-page" not in modified_custom_prompt.lower():
158
+ modified_custom_prompt += " This is a multi-page PDF document."
159
+ elif not modified_custom_prompt:
160
+ modified_custom_prompt = "This is a multi-page PDF document."
161
+
162
+ # For certain filenames, explicitly add document type hints
163
+ filename_lower = uploaded_file.name.lower()
164
+ if "handwritten" in filename_lower or "letter" in filename_lower or "journal" in filename_lower:
165
+ if not modified_custom_prompt:
166
+ modified_custom_prompt = "This is a handwritten document in PDF format. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
167
+ elif "handwritten" not in modified_custom_prompt.lower():
168
+ modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text."
169
+
170
+ # Update the cache key with the modified prompt
171
+ if modified_custom_prompt != custom_prompt:
172
+ cache_key = generate_cache_key(
173
+ open(temp_path, 'rb').read(),
174
+ file_type,
175
+ use_vision,
176
+ preprocessing_options,
177
+ pdf_rotation,
178
+ modified_custom_prompt
179
+ )
180
+
181
+ result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options), modified_custom_prompt)
182
  progress_reporter.update(90, "Finalizing results...")
183
  except Exception as e:
184
  logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
 
187
  # If caching fails, process directly
188
  processor = StructuredOCR()
189
 
190
+
191
+ # Check if preprocessing options indicate a handwritten document
192
+ handwritten_document = preprocessing_options.get("document_type") == "handwritten"
193
+ modified_custom_prompt = custom_prompt
194
+
195
+ # Add handwritten specific instructions if needed
196
+ if handwritten_document and modified_custom_prompt:
197
+ if "handwritten" not in modified_custom_prompt.lower():
198
+ modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
199
+ elif handwritten_document and not modified_custom_prompt:
200
+ modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
201
+
202
+ # Add PDF-specific instructions if needed
203
+ if custom_prompt and "pdf" not in modified_custom_prompt.lower() and "multi-page" not in modified_custom_prompt.lower():
204
+ modified_custom_prompt += " This is a multi-page PDF document."
205
 
206
  # Process directly with optimized settings
207
  result = processor.process_file(
208
  file_path=temp_path,
209
  file_type="pdf",
210
  use_vision=use_vision,
211
+ custom_prompt=modified_custom_prompt,
212
  file_size_mb=file_size_mb,
213
  pdf_rotation=pdf_rotation
214
  )
 
242
  # Process the file using cached function if possible
243
  progress_reporter.update(50, "Processing document with OCR...")
244
  try:
245
+ # Check if preprocessing options indicate a handwritten document
246
+ handwritten_document = preprocessing_options.get("document_type") == "handwritten"
247
+ modified_custom_prompt = custom_prompt
248
+
249
+ # Add handwritten specific instructions if needed
250
+ if handwritten_document and modified_custom_prompt:
251
+ if "handwritten" not in modified_custom_prompt.lower():
252
+ modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
253
+ elif handwritten_document and not modified_custom_prompt:
254
+ modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
255
+
256
+ # For certain filenames, explicitly add document type hints
257
+ filename_lower = uploaded_file.name.lower()
258
+ if "handwritten" in filename_lower or "letter" in filename_lower or "journal" in filename_lower:
259
+ if not modified_custom_prompt:
260
+ modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
261
+ elif "handwritten" not in modified_custom_prompt.lower():
262
+ modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text."
263
+
264
+ # Update the cache key with the modified prompt
265
+ if modified_custom_prompt != custom_prompt:
266
+ cache_key = generate_cache_key(
267
+ open(temp_path, 'rb').read(),
268
+ file_type,
269
+ use_vision,
270
+ preprocessing_options,
271
+ 0,
272
+ modified_custom_prompt
273
+ )
274
+
275
+ result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options), modified_custom_prompt)
276
  progress_reporter.update(80, "Analyzing document structure...")
277
  progress_reporter.update(90, "Finalizing results...")
278
  except Exception as e:
 
287
  # Use simpler processing for speed
288
  pass # Any speed optimizations would be handled by the StructuredOCR class
289
 
290
+ # Check if preprocessing options indicate a handwritten document
291
+ handwritten_document = preprocessing_options.get("document_type") == "handwritten"
292
+ modified_custom_prompt = custom_prompt
293
+
294
+ # Add handwritten specific instructions if needed
295
+ if handwritten_document and modified_custom_prompt:
296
+ if "handwritten" not in modified_custom_prompt.lower():
297
+ modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
298
+ elif handwritten_document and not modified_custom_prompt:
299
+ modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
300
+
301
+ # For certain filenames, explicitly add document type hints
302
+ filename_lower = uploaded_file.name.lower()
303
+ if "handwritten" in filename_lower or "letter" in filename_lower or "journal" in filename_lower:
304
+ if not modified_custom_prompt:
305
+ modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
306
+ elif "handwritten" not in modified_custom_prompt.lower():
307
+ modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text."
308
+
309
  result = processor.process_file(
310
  file_path=temp_path,
311
  file_type=file_type,
312
  use_vision=use_vision,
313
+ custom_prompt=modified_custom_prompt,
314
  file_size_mb=file_size_mb
315
  )
316
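The ocr_processing.py changes above repeat the same prompt-augmentation logic in three places (cached path, PDF fallback, image fallback). A hedged sketch of that logic pulled into a single helper; the function name is hypothetical, while the instruction strings mirror the diff:

# Hypothetical helper consolidating the prompt hints added in this commit.
# Illustrative only; the committed code inlines this logic at each call site.
from typing import Optional

HANDWRITTEN_HINT = ("This is a handwritten document. Please carefully transcribe all "
                    "handwritten text, preserving line breaks and original formatting.")
PDF_HINT = "This is a multi-page PDF document."

def augment_prompt(custom_prompt: Optional[str],
                   document_type: str,
                   filename: str,
                   is_pdf: bool) -> str:
    prompt = custom_prompt or ""
    filename_lower = filename.lower()

    # Add handwriting instructions from the preprocessing option or filename hints.
    looks_handwritten = (document_type == "handwritten" or
                         any(k in filename_lower for k in ("handwritten", "letter", "journal")))
    if looks_handwritten and "handwritten" not in prompt.lower():
        prompt = (prompt + " " + HANDWRITTEN_HINT).strip()

    # Add a multi-page hint for PDFs.
    if is_pdf and "pdf" not in prompt.lower() and "multi-page" not in prompt.lower():
        prompt = (prompt + " " + PDF_HINT).strip()

    return prompt

# Example: a handwritten letter uploaded as a PDF with no user prompt.
print(augment_prompt(None, "handwritten", "family-letter.pdf", is_pdf=True))

Because the diff also adds custom_prompt as a parameter of the st.cache_data-wrapped process_file_cached, the prompt becomes part of the cache key, so results produced with different prompts no longer collide in the cache.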
 
ocr_utils.py CHANGED
@@ -3,12 +3,12 @@ Utility functions for OCR processing with Mistral AI.
3
  Contains helper functions for working with OCR responses and image handling.
4
  """
5
 
 
6
  import json
7
  import base64
8
  import io
9
  import zipfile
10
  import logging
11
- import numpy as np
12
  import time
13
  from datetime import datetime
14
  from pathlib import Path
@@ -16,20 +16,29 @@ from typing import Dict, List, Optional, Union, Any, Tuple
16
  from functools import lru_cache
17
 
18
  # Configure logging
19
- logger = logging.getLogger("ocr_utils")
20
 
 
21
  try:
22
  from PIL import Image, ImageEnhance, ImageFilter, ImageOps
23
- import cv2
24
  PILLOW_AVAILABLE = True
  CV2_AVAILABLE = True
26
- except ImportError as e:
27
- # Check which image libraries are available
28
- if "PIL" in str(e):
29
- PILLOW_AVAILABLE = False
30
- if "cv2" in str(e):
31
- CV2_AVAILABLE = False
32
 
 
33
  from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
34
  from mistralai.models import OCRImageObject
35
 
@@ -110,9 +119,36 @@ def encode_image_for_api(image_path: Union[str, Path]) -> str:
110
  if not image_file.is_file():
111
  raise FileNotFoundError(f"Image file not found: {image_file}")
112
 
113
  # Encode image as base64
114
  encoded = base64.b64encode(image_file.read_bytes()).decode()
115
- return f"data:image/jpeg;base64,{encoded}"
116
 
117
  def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
118
  """
@@ -509,7 +545,7 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
509
  aspect_ratio = width / height
510
 
511
  # Newspaper-style documents typically have width > height or are very large
512
- is_newspaper_format = (aspect_ratio > 1.2 and width > 2000) or (width > 3000 or height > 3000)
513
 
514
  if is_newspaper_format:
515
  logger.info(f"Newspaper format detected: {width}x{height}, applying specialized processing")
@@ -560,7 +596,7 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
560
  if is_document:
561
  # Newspapers typically have wide formats or very large dimensions
562
  aspect_ratio = width / height
563
- is_newspaper = (aspect_ratio > 1.2 and width > 2000) or (width > 3000 or height > 3000)
564
 
565
  logger.debug(f"Document type detection for {image_file.name}: " +
566
  f"{'newspaper' if is_newspaper else 'document' if is_document else 'photo'}")
@@ -712,6 +748,7 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
712
 
713
  # Get base64 with minimal memory footprint
714
  encoded_image = base64.b64encode(buffer.getvalue()).decode()
 
715
  base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
716
 
717
  # Update cache thread-safely
@@ -932,7 +969,7 @@ def _preprocess_document_image_impl() -> Image.Image:
932
 
933
  # Check for newspaper format first (takes precedence)
934
  aspect_ratio = width / height
935
- if (aspect_ratio > 1.2 and width > 2000) or (width > 3000 or height > 3000):
936
  is_newspaper = True
937
  logger.debug(f"Newspaper format detected: {width}x{height}, aspect ratio: {aspect_ratio:.2f}")
938
  else:
 
3
  Contains helper functions for working with OCR responses and image handling.
4
  """
5
 
6
+ # Standard library imports
7
  import json
8
  import base64
9
  import io
10
  import zipfile
11
  import logging
 
12
  import time
13
  from datetime import datetime
14
  from pathlib import Path
 
16
  from functools import lru_cache
17
 
18
  # Configure logging
19
+ logging.basicConfig(level=logging.INFO,
20
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # Third-party imports
24
+ import numpy as np
25
 
26
+ # Check for image processing libraries
27
  try:
28
  from PIL import Image, ImageEnhance, ImageFilter, ImageOps
 
29
  PILLOW_AVAILABLE = True
30
+ except ImportError:
31
+ logger.warning("PIL not available - image preprocessing will be limited")
32
+ PILLOW_AVAILABLE = False
33
+
34
+ try:
35
+ import cv2
36
  CV2_AVAILABLE = True
37
+ except ImportError:
38
+ logger.warning("OpenCV (cv2) not available - advanced image processing will be limited")
39
+ CV2_AVAILABLE = False
 
 
 
40
 
41
+ # Mistral AI imports
42
  from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
43
  from mistralai.models import OCRImageObject
44
 
 
119
  if not image_file.is_file():
120
  raise FileNotFoundError(f"Image file not found: {image_file}")
121
 
122
+ # Determine mime type based on file extension
123
+ mime_type = 'image/jpeg' # Default mime type
124
+ suffix = image_file.suffix.lower()
125
+ if suffix == '.png':
126
+ mime_type = 'image/png'
127
+ elif suffix == '.gif':
128
+ mime_type = 'image/gif'
129
+ elif suffix in ['.jpg', '.jpeg']:
130
+ mime_type = 'image/jpeg'
131
+ elif suffix == '.pdf':
132
+ mime_type = 'application/pdf'
133
+
134
  # Encode image as base64
135
  encoded = base64.b64encode(image_file.read_bytes()).decode()
136
+ return f"data:{mime_type};base64,{encoded}"
137
+
138
+ def encode_bytes_for_api(file_bytes: bytes, mime_type: str) -> str:
139
+ """
140
+ Encode binary data as base64 data URL for API submission.
141
+
142
+ Args:
143
+ file_bytes: Binary file data
144
+ mime_type: MIME type of the file (e.g., 'image/jpeg', 'application/pdf')
145
+
146
+ Returns:
147
+ Base64 data URL for the data
148
+ """
149
+ # Encode data as base64
150
+ encoded = base64.b64encode(file_bytes).decode()
151
+ return f"data:{mime_type};base64,{encoded}"
152
 
153
  def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
154
  """
 
545
  aspect_ratio = width / height
546
 
547
  # Newspaper-style documents typically have width > height or are very large
548
+ is_newspaper_format = (aspect_ratio > 1.15 and width > 2000) or (width > 3000 or height > 3000)
549
 
550
  if is_newspaper_format:
551
  logger.info(f"Newspaper format detected: {width}x{height}, applying specialized processing")
 
596
  if is_document:
597
  # Newspapers typically have wide formats or very large dimensions
598
  aspect_ratio = width / height
599
+ is_newspaper = (aspect_ratio > 1.15 and width > 2000) or (width > 3000 or height > 3000)
600
 
601
  logger.debug(f"Document type detection for {image_file.name}: " +
602
  f"{'newspaper' if is_newspaper else 'document' if is_document else 'photo'}")
 
748
 
749
  # Get base64 with minimal memory footprint
750
  encoded_image = base64.b64encode(buffer.getvalue()).decode()
751
+ # Always use image/jpeg MIME type since we explicitly save as JPEG above
752
  base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
753
 
754
  # Update cache thread-safely
 
969
 
970
  # Check for newspaper format first (takes precedence)
971
  aspect_ratio = width / height
972
+ if (aspect_ratio > 1.15 and width > 2000) or (width > 3000 or height > 3000):
973
  is_newspaper = True
974
  logger.debug(f"Newspaper format detected: {width}x{height}, aspect ratio: {aspect_ratio:.2f}")
975
  else:
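The ocr_utils.py changes above add MIME-type detection to encode_image_for_api and introduce encode_bytes_for_api, so PNGs and PDFs are no longer labelled image/jpeg. A small usage sketch of the same idea; the path is a placeholder, and the standard-library mimetypes module is used here only for illustration (the diff maps file suffixes by hand):

# Illustrative only: building a base64 data URL the way the new helpers do.
# The committed encode_image_for_api maps suffixes explicitly instead of using mimetypes.
import base64
import mimetypes
from pathlib import Path

def to_data_url(path: str) -> str:
    file_path = Path(path)
    mime_type, _ = mimetypes.guess_type(file_path.name)
    mime_type = mime_type or "image/jpeg"   # default mirrors the diff
    encoded = base64.b64encode(file_path.read_bytes()).decode()
    return f"data:{mime_type};base64,{encoded}"

if __name__ == "__main__":
    # A PNG now keeps its real MIME type instead of being sent as image/jpeg.
    url = to_data_url("input/milgram-flier.png")
    print(url[:40], "...")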
structured_ocr.py CHANGED
@@ -1,24 +1,31 @@
 
1
  import os
2
  import sys
3
  import time
4
  import random
5
- from enum import Enum
6
- from pathlib import Path
7
  import json
8
  import base64
9
  import logging
 
 
10
  from functools import lru_cache
11
  from typing import Optional, Dict, Any, List, Union, Tuple
12
13
  # Try to import pycountry, provide fallback if not available
14
  try:
15
  import pycountry
16
  PYCOUNTRY_AVAILABLE = True
17
  except ImportError:
18
  PYCOUNTRY_AVAILABLE = False
19
- logging.warning("pycountry module not available - using language code fallback")
20
-
21
- from pydantic import BaseModel
22
 
23
  # Try to import Mistral AI, provide fallback if not available
24
  try:
@@ -28,11 +35,7 @@ try:
28
  MISTRAL_AVAILABLE = True
29
  except ImportError:
30
  MISTRAL_AVAILABLE = False
31
- logging.warning("mistralai module not available - OCR functionality will be limited")
32
-
33
- # Configure logging
34
- logging.basicConfig(level=logging.INFO,
35
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
36
 
37
  # Import utilities for OCR processing
38
  try:
@@ -216,6 +219,12 @@ class StructuredOCR:
216
  if file_type is None:
217
  suffix = file_path.suffix.lower()
218
  file_type = "pdf" if suffix == ".pdf" else "image"
219
 
220
  # Get file size if not provided
221
  if file_size_mb is None and file_path.exists():
@@ -437,6 +446,7 @@ class StructuredOCR:
437
  # Convert only the selected pages to minimize memory usage
438
  selected_images = []
439
  combined_text = []
 
440
 
441
  # Process pages in larger batches for better efficiency
442
  batch_size = 5 # Process 5 pages at a time for better throughput
@@ -472,6 +482,11 @@ class StructuredOCR:
472
  # Add page text to combined text without obvious page markers
473
  page_text = page_result['ocr_contents']['raw_text']
474
  combined_text.append(f"{page_text}")
475
  except Exception as page_e:
476
  logger.warning(f"Error processing page {page_num}: {str(page_e)}")
477
  # Clean up temp file
@@ -509,28 +524,7 @@ class StructuredOCR:
509
  # Add flag to indicate custom prompt was applied
510
  result['custom_prompt_applied'] = 'text_only'
511
 
512
- # Detect document type from custom prompt if available
513
- if custom_prompt:
514
- # Extract document type if specified
515
- doc_type = "general"
516
- if "DOCUMENT TYPE:" in custom_prompt:
517
- doc_type_line = custom_prompt.split("\n")[0]
518
- if "DOCUMENT TYPE:" in doc_type_line:
519
- doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip().lower()
520
- # Keyword-based detection as fallback
521
- elif any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
522
- doc_type = "newspaper"
523
- elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
524
- doc_type = "letter"
525
- elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
526
- doc_type = "book"
527
- elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
528
- doc_type = "form"
529
- elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
530
- doc_type = "recipe"
531
-
532
- # Store detected document type in result
533
- result['detected_document_type'] = doc_type
534
 
535
  except Exception as e:
536
  logger.warning(f"Custom prompt processing failed: {str(e)}. Using standard processing.")
@@ -544,6 +538,10 @@ class StructuredOCR:
544
  if 'ocr_contents' in result:
545
  result['ocr_contents']['raw_text'] = all_text
546
 
 
 
 
 
547
  # Add PDF metadata
548
  result['file_name'] = file_path.name
549
  result['pdf_processing_method'] = 'pdf2image_optimized'
@@ -711,6 +709,24 @@ class StructuredOCR:
711
  limited_pages = True
712
  logger.info(f"Processing {len(pages_to_process)} pages out of {total_pages} total")
713
 
 
714
  # Calculate confidence score if available
715
  try:
716
  confidence_values = [page.confidence for page in pages_to_process if hasattr(page, 'confidence')]
@@ -733,6 +749,12 @@ class StructuredOCR:
733
  if page_markdown.strip():
734
  all_markdown.append(f"{page_markdown}")
735
 
 
 
736
  # Join all pages with separation
737
  combined_markdown = "\n\n".join(all_markdown)
738
 
@@ -766,6 +788,13 @@ class StructuredOCR:
766
  combined_markdown, file_path.name, custom_prompt
767
  )
768
 
 
769
  # Add metadata about pages
770
  if limited_pages:
771
  result['limited_pages'] = {
@@ -927,24 +956,44 @@ class StructuredOCR:
927
  "confidence_score": 0.0
928
  }
929
 
930
- # Check if this is likely a newspaper or document with columns by filename
931
  is_likely_newspaper = False
 
 
932
  newspaper_keywords = ["newspaper", "gazette", "herald", "times", "journal",
933
  "chronicle", "post", "tribune", "news", "press", "gender"]
 
 
934
 
935
- # Check filename for newspaper indicators
936
  filename_lower = file_path.name.lower()
937
- for keyword in newspaper_keywords:
 
 
938
  if keyword in filename_lower:
939
- is_likely_newspaper = True
940
- logger.info(f"Likely newspaper document detected from filename: {file_path.name}")
941
- # Add newspaper-specific processing hint to custom_prompt if not already present
942
  if custom_prompt:
943
- if "column" not in custom_prompt.lower() and "newspaper" not in custom_prompt.lower():
944
- custom_prompt = custom_prompt + " This appears to be a newspaper or document with columns. Please extract all text content from each column."
945
  else:
946
- custom_prompt = "This appears to be a newspaper or document with columns. Please extract all text content from each column, maintaining proper reading order."
947
  break
 
 
948
 
949
  try:
950
  # Check file size
@@ -1017,21 +1066,24 @@ class StructuredOCR:
1017
  logger.info(f"Resized image to {new_size_mb:.2f} MB")
1018
  except ImportError:
1019
  logger.warning("PIL not available for resizing. Using original image.")
1020
- encoded_image = base64.b64encode(file_path.read_bytes()).decode()
1021
- base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
 
1022
  except Exception as e:
1023
  logger.warning(f"Image resize failed: {str(e)}. Using original image.")
1024
- encoded_image = base64.b64encode(file_path.read_bytes()).decode()
1025
- base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
 
1026
  else:
1027
- # For smaller images, use as-is
1028
- encoded_image = base64.b64encode(file_path.read_bytes()).decode()
1029
- base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
1030
  except Exception as e:
1031
  # Fallback to original image if any preprocessing fails
1032
  logger.warning(f"Image preprocessing failed: {str(e)}. Using original image.")
1033
- encoded_image = base64.b64encode(file_path.read_bytes()).decode()
1034
- base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
 
1035
 
1036
  # Process the image with OCR
1037
  logger.info(f"Processing image with OCR using {OCR_MODEL}")
@@ -1123,10 +1175,40 @@ class StructuredOCR:
1123
  # Get the OCR markdown from the first page
1124
  image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""
1125
 
 
 
1126
  # Optimize: Skip vision model step if ocr_markdown is very small or empty
1127
  # BUT make an exception for newspapers or if custom_prompt is provided
1128
- if (not is_likely_newspaper and not custom_prompt) and (not image_ocr_markdown or len(image_ocr_markdown) < 50):
1129
- logger.warning("OCR produced minimal or no text. Returning basic result.")
 
1130
  return {
1131
  "file_name": file_path.name,
1132
  "topics": ["Document"],
@@ -1134,7 +1216,9 @@ class StructuredOCR:
1134
  "ocr_contents": {
1135
  "raw_text": image_ocr_markdown if image_ocr_markdown else "No text could be extracted from the image."
1136
  },
1137
- "processing_note": "OCR produced minimal text content"
 
 
1138
  }
1139
 
1140
  # For newspapers with little text in OCR, set a more explicit prompt
@@ -1144,6 +1228,14 @@ class StructuredOCR:
1144
  custom_prompt = "This is a newspaper or document with columns. The OCR may not have captured all text. Please examine the image carefully and extract ALL text content visible in the document, reading each column from top to bottom."
1145
  elif "extract all text" not in custom_prompt.lower():
1146
  custom_prompt += " Please examine the image carefully and extract ALL text content visible in the document."
 
 
1147
 
1148
  # Extract structured data using the appropriate model, with a single API call
1149
  if use_vision:
@@ -1153,6 +1245,13 @@ class StructuredOCR:
1153
  logger.info(f"Using text-only model: {TEXT_MODEL}")
1154
  result = self._extract_structured_data_text_only(image_ocr_markdown, file_path.name, custom_prompt)
1155
 
 
1156
  # Store the serialized OCR response for image rendering (for compatibility with original version)
1157
  # Don't store raw_response directly as it's not JSON serializable
1158
  serialized_response = serialize_ocr_response(image_response)
@@ -1160,7 +1259,6 @@ class StructuredOCR:
1160
 
1161
  # Store key parts of the OCR response for image rendering
1162
  # With serialized format that can be stored in JSON
1163
- has_images = hasattr(image_response, 'pages') and image_response.pages and hasattr(image_response.pages[0], 'images') and image_response.pages[0].images
1164
  result['has_images'] = has_images
1165
 
1166
  if has_images:
@@ -1273,10 +1371,6 @@ class StructuredOCR:
1273
  logger.info("Test mode or no API key, using text-only processing")
1274
  return self._extract_structured_data_text_only(ocr_markdown, filename)
1275
 
1276
- # Detect document type with optimized cached implementation
1277
- doc_type = self._detect_document_type(custom_prompt, ocr_markdown)
1278
- logger.info(f"Detected document type: {doc_type}")
1279
-
1280
  # Use only the first part of OCR text to keep prompts small and processing fast
1281
  if len(ocr_markdown) > 1000:
1282
  truncated_ocr = ocr_markdown[:1000]
@@ -1284,8 +1378,26 @@ class StructuredOCR:
1284
  else:
1285
  truncated_ocr = ocr_markdown
1286
 
1287
- # Build an optimized prompt based on document type
1288
- enhanced_prompt = self._build_enhanced_prompt(doc_type, truncated_ocr, custom_prompt)
 
 
 
1289
 
1290
  # Measure API call time for optimization feedback
1291
  start_time = time.time()
@@ -1294,7 +1406,7 @@ class StructuredOCR:
1294
  # Use a fixed, shorter timeout for single-page documents
1295
  timeout_ms = 45000 # 45 seconds is optimal for most single-page documents
1296
 
1297
- logger.info(f"Calling vision model with {timeout_ms}ms timeout and document type {doc_type}")
1298
  chat_response = self.client.chat.parse(
1299
  model=VISION_MODEL,
1300
  messages=[
@@ -1312,7 +1424,7 @@ class StructuredOCR:
1312
  )
1313
 
1314
  api_time = time.time() - start_time
1315
- logger.info(f"Vision model completed in {api_time:.2f}s with document type: {doc_type}")
1316
 
1317
  except Exception as e:
1318
  # If there's an error with the enhanced prompt, try progressively simpler approaches
@@ -1392,42 +1504,16 @@ class StructuredOCR:
1392
  if 'languages' in result:
1393
  result['languages'] = [str(lang) for lang in result.get('languages', [])]
1394
 
1395
- # Add metadata about processing
1396
  result['processing_info'] = {
1397
  'method': 'vision_model',
1398
- 'document_type': doc_type,
1399
  'ocr_text_length': len(ocr_markdown),
1400
  'api_response_time': time.time() - start_time
1401
  }
1402
 
1403
- # Flag when custom prompt has been successfully applied
1404
  if custom_prompt:
1405
  result['custom_prompt_applied'] = 'vision_model'
1406
-
1407
- # Attempt to detect document type from custom prompt
1408
- if "DOCUMENT TYPE:" in custom_prompt:
1409
- doc_type_line = custom_prompt.split("\n")[0]
1410
- if "DOCUMENT TYPE:" in doc_type_line:
1411
- custom_doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip().lower()
1412
- result['detected_document_type'] = custom_doc_type
1413
- # Keyword-based detection as fallback
1414
- elif any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
1415
- result['detected_document_type'] = "newspaper"
1416
- elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
1417
- result['detected_document_type'] = "letter"
1418
- elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
1419
- result['detected_document_type'] = "book"
1420
- elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
1421
- result['detected_document_type'] = "form"
1422
- elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
1423
- result['detected_document_type'] = "recipe"
1424
- elif "this is a" in custom_prompt.lower():
1425
- # Extract document type from "This is a [type]" format
1426
- this_is_parts = custom_prompt.lower().split("this is a ")
1427
- if len(this_is_parts) > 1:
1428
- extracted_type = this_is_parts[1].split(".")[0].strip()
1429
- if extracted_type:
1430
- result['detected_document_type'] = extracted_type
1431
 
1432
  # Add confidence score if not present
1433
  if 'confidence_score' not in result:
@@ -1440,268 +1526,38 @@ class StructuredOCR:
1440
 
1441
  return result
1442
 
1443
- # Thread-safe document type detection cache with increased size for better performance
1444
- _doc_type_cache = {}
1445
- _doc_type_cache_size = 256
1446
-
1447
- @staticmethod
1448
- def _detect_document_type_cached(custom_prompt: Optional[str], ocr_text_sample: str) -> str:
1449
- """
1450
- Cached version of document type detection logic with thread-safe implementation
1451
- """
1452
- # Generate cache key - use first 50 chars of prompt and ocr_text to avoid memory issues
1453
- prompt_key = str(custom_prompt)[:50] if custom_prompt else ""
1454
- text_key = ocr_text_sample[:50] if ocr_text_sample else ""
1455
- cache_key = f"{prompt_key}::{text_key}"
1456
-
1457
- # Check cache first (fast path)
1458
- if cache_key in StructuredOCR._doc_type_cache:
1459
- return StructuredOCR._doc_type_cache[cache_key]
1460
-
1461
- # Set default document type
1462
- doc_type = "general"
1463
-
1464
- # Optimized pattern matching with compiled lookup dictionaries
1465
- doc_type_patterns = {
1466
- "handwritten": ["handwritten", "handwriting", "cursive", "manuscript"],
1467
- "letter": ["letter", "correspondence", "message", "dear sir", "dear madam", "sincerely", "yours truly"],
1468
- "legal": ["form", "contract", "agreement", "legal", "certificate", "court", "attorney", "plaintiff", "defendant"],
1469
- "recipe": ["recipe", "food", "ingredients", "directions", "tbsp", "tsp", "cup", "mix", "bake", "cooking"],
1470
- "travel": ["travel", "expedition", "journey", "exploration", "voyage", "destination", "map"],
1471
- "scientific": ["scientific", "experiment", "hypothesis", "research", "study", "analysis", "results", "procedure"],
1472
- "newspaper": ["news", "newspaper", "article", "press", "headline", "column", "editor"]
1473
- }
1474
-
1475
- # Fast custom prompt matching
1476
- if custom_prompt:
1477
- prompt_lower = custom_prompt.lower()
1478
-
1479
- # Optimized pattern matching with early exit
1480
- for detected_type, patterns in doc_type_patterns.items():
1481
- if any(term in prompt_lower for term in patterns):
1482
- doc_type = detected_type
1483
- break
1484
-
1485
- # Fast OCR text matching if still general type
1486
- if doc_type == "general" and ocr_text_sample:
1487
- ocr_lower = ocr_text_sample.lower()
1488
-
1489
- # Use the same patterns dictionary for consistency, but scan the OCR text
1490
- for detected_type, patterns in doc_type_patterns.items():
1491
- if any(term in ocr_lower for term in patterns):
1492
- doc_type = detected_type
1493
- break
1494
-
1495
- # Cache the result with improved LRU-like behavior
1496
- if len(StructuredOCR._doc_type_cache) >= StructuredOCR._doc_type_cache_size:
1497
- # Clear multiple entries at once for better performance
1498
- try:
1499
- # Remove up to 20 entries to avoid frequent cache clearing
1500
- for _ in range(20):
1501
- if StructuredOCR._doc_type_cache:
1502
- StructuredOCR._doc_type_cache.pop(next(iter(StructuredOCR._doc_type_cache)))
1503
- except:
1504
- # If concurrent modification causes issues, just proceed
1505
- pass
1506
-
1507
- # Store in cache
1508
- StructuredOCR._doc_type_cache[cache_key] = doc_type
1509
-
1510
- return doc_type
1511
-
1512
- def _detect_document_type(self, custom_prompt: Optional[str], ocr_text: str) -> str:
1513
- """
1514
- Detect document type based on content and custom prompt.
1515
-
1516
- Args:
1517
- custom_prompt: User-provided custom prompt
1518
- ocr_text: OCR-extracted text
1519
-
1520
- Returns:
1521
- Document type identifier ("handwritten", "printed", "letter", etc.)
1522
- """
1523
- # Only sample first 1000 characters of OCR text for faster processing while maintaining accuracy
1524
- ocr_sample = ocr_text[:1000] if ocr_text else ""
1525
-
1526
- # Use the cached version for better performance
1527
- return self._detect_document_type_cached(custom_prompt, ocr_sample)
1528
-
1529
- def _build_enhanced_prompt(self, doc_type: str, ocr_text: str, custom_prompt: Optional[str]) -> str:
1530
- """
1531
- Build an optimized prompt focused on OCR accuracy with specialized attention to
1532
- historical typography, manuscript conventions, and document deterioration patterns.
1533
 
1534
- Args:
1535
- doc_type: Detected document type
1536
- ocr_text: OCR-extracted text
1537
- custom_prompt: User-provided custom prompt
1538
-
1539
- Returns:
1540
- Optimized prompt focused on text extraction with historical document expertise
1541
- """
1542
- # Generic document section (included in all prompts)
1543
  generic_section = (
1544
- f"This is a document's OCR text:\n"
1545
- f"<BEGIN_OCR>\n{ocr_text}\n<END_OCR>\n\n"
 
 
1546
  )
1547
 
1548
- # Check if custom prompt contains document type information
1549
- has_custom_doc_type = False
1550
- custom_doc_type = ""
1551
-
1552
- if custom_prompt and "DOCUMENT TYPE:" in custom_prompt:
1553
- # Extract the document type from the custom prompt
1554
- doc_type_line = custom_prompt.split("\n")[0]
1555
- if "DOCUMENT TYPE:" in doc_type_line:
1556
- custom_doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip()
1557
- has_custom_doc_type = True
1558
- # If we have a custom doc type, use it instead
1559
- if custom_doc_type:
1560
- doc_type = custom_doc_type.lower()
1561
-
1562
- # If user has provided detailed instructions, provide more elaborate prompting
1563
- if custom_prompt and (has_custom_doc_type or len(custom_prompt.strip()) > 20):
1564
- # Enhanced prompt for documents with custom instructions and historical expertise
1565
- specific_section = (
1566
- f"You are an advanced OCR specialist with expertise in historical documents, typography, and manuscript conventions. "
1567
- f"Below is a document that requires specialized analysis with attention to historical characteristics. "
1568
- f"Pay particular attention to:\n"
1569
- f"- Historical typography features (long s 'ſ', ligatures, obsolete letter forms)\n"
1570
- f"- Manuscript conventions of the period (abbreviations, contractions, marginalia)\n"
1571
- f"- Document deterioration patterns (faded ink, foxing, water damage, paper degradation)\n"
1572
- f"- Accurately capturing ALL text content visible in the image with historical context\n"
1573
- f"- Following the specific user instructions for processing this document type\n"
1574
- f"- Identifying key information, structure, and historical formatting conventions\n"
1575
- f"- Providing comprehensive analysis with attention to historical context\n"
1576
- )
1577
-
1578
- # Add specialized instructions based on document type
1579
- if doc_type == "newspaper":
1580
- specific_section += (
1581
- f"\nThis appears to be a newspaper or document with columns. "
1582
- f"Please read each column from top to bottom, then move to the next column. "
1583
- f"Extract all article titles, headings, bylines, and body text in the correct reading order. "
1584
- f"Pay special attention to section headers, page numbers, publication date, and newspaper name. "
1585
- f"For historical newspapers, be aware of period-specific typography such as the long s (ſ), "
1586
- f"unique ligatures (æ, œ, ct, st), and decorative fonts. Account for paper degradation around "
1587
- f"fold lines and edges. Recognize archaic abbreviations and typesetting conventions of the period.\n"
1588
- )
1589
- elif doc_type == "letter":
1590
- specific_section += (
1591
- f"\nThis appears to be a letter or correspondence. "
1592
- f"Pay special attention to the letterhead, date, greeting, body content, closing, and signature. "
1593
- f"Preserve the original formatting including paragraph breaks and indentation. "
1594
- f"Note any handwritten annotations or marginalia separately. "
1595
- f"For historical letters, carefully transcribe historical scripts and handwriting styles, "
1596
- f"noting unclear or damaged sections. Identify period-specific salutations, closings, and "
1597
- f"formalities. Watch for ink fading, bleeding, and seepage through pages. "
1598
- f"Recognize period-specific abbreviations (ye, yr, inst, ult, prox) and long s (ſ) in older printed correspondence.\n"
1599
- )
1600
- elif doc_type == "book":
1601
- specific_section += (
1602
- f"\nThis appears to be a book or publication page. "
1603
- f"Pay attention to chapter titles, headers, page numbers, footnotes, and main body text. "
1604
- f"Preserve paragraph structure and any special formatting. "
1605
- f"Note any images, tables, or figures that might be referenced in the text. "
1606
- f"For historical books, attend to period typography including the long s (ſ), ligatures (æ, œ, ct, ſt), "
1607
- f"archaic letter forms, and decorative initials/drop caps. Account for foxing (brown spotting), "
1608
- f"bleed-through from opposite pages, and binding damage. Recognize period-specific typographic "
1609
- f"conventions like catchwords, signatures, obsolete punctuation, and historical spelling variants "
1610
- f"(e.g., -ize/-ise, past tense 'd for -ed). Note bookplates, ownership marks, and marginalia.\n"
1611
- )
1612
- elif doc_type == "form":
1613
- specific_section += (
1614
- f"\nThis appears to be a form or legal document. "
1615
- f"Carefully extract all field labels and their corresponding values. "
1616
- f"Preserve the structure of form fields and sections. "
1617
- f"Pay special attention to signature lines, dates, and any official markings. "
1618
- f"For historical forms and legal documents, recognize period-specific legal terminology and "
1619
- f"formulaic phrases. Note seals, stamps, watermarks, and official emblems. Watch for faded ink "
1620
- f"in signatures and filled fields. Identify period handwriting styles in completed sections. "
1621
- f"Account for specialized legal abbreviations (e.g., SS., Esq., inst., wit.) and archaic "
1622
- f"measurement units. Note folding patterns and worn edges common in frequently handled legal documents.\n"
1623
- )
1624
- elif doc_type == "recipe":
1625
- specific_section += (
1626
- f"\nThis appears to be a recipe or food-related document. "
1627
- f"Extract the recipe title, ingredient list (with measurements), preparation steps, "
1628
- f"cooking times, serving information, and any notes or tips. "
1629
- f"Maintain the distinction between ingredients and preparation instructions. "
1630
- f"For historical recipes, attend to archaic measurements (gill, dram, peck, firkin), obsolete "
1631
- f"cooking terminology, and period-specific ingredients and their modern equivalents. Note handwritten "
1632
- f"annotations and personal modifications. Identify period-specific cooking methods and tools that "
1633
- f"might need explanation. Watch for liquid stains and food residue common on well-used recipe pages. "
1634
- f"Recognize unclear fractions and temperature instructions (e.g., 'slow oven', 'quick fire').\n"
1635
- )
1636
-
1637
- # Output instructions (enhanced for custom requests)
1638
- output_section = (
1639
- f"Create a detailed structured JSON response with the following fields:\n"
1640
- f"- file_name: The document's name\n"
1641
- f"- topics: An array of specific topics, themes, or subjects covered in the document\n"
1642
- f"- languages: An array of languages used in the document\n"
1643
- f"- ocr_contents: A comprehensive dictionary with the document's contents including:\n"
1644
- f" * title: The main title or heading\n"
1645
- f" * subtitle: Any subtitle or secondary heading (if present)\n"
1646
- f" * date: Publication or document date (if present)\n"
1647
- f" * author: Author or creator information (if present)\n"
1648
- f" * content: The main body content, properly formatted\n"
1649
- f" * additional sections as appropriate for this document type\n"
1650
- f" * raw_text: The complete OCR text\n"
1651
- )
1652
- else:
1653
- # Default processing with basic historical document awareness
1654
- specific_section = (
1655
- f"You are an OCR specialist with knowledge of historical documents and typography. "
1656
- f"Focus on accurately extracting text content with attention to historical features. "
1657
- f"Pay special attention to:\n"
1658
- f"- Accurately capturing ALL text content visible in the image\n"
1659
- f"- Maintaining the correct reading order and structure\n"
1660
- f"- Preserving paragraph breaks and text layout\n"
1661
- f"- Identifying the main document type, time period, and language\n"
1662
- f"- Recognizing historical typography features (long s 'ſ', ligatures, archaic characters)\n"
1663
- f"- Accounting for document deterioration (faded ink, stains, foxing, physical damage)\n"
1664
- )
1665
-
1666
- # Only add specialized instructions for newspapers with columns
1667
- if doc_type == "newspaper":
1668
- specific_section += (
1669
- f"\nThis appears to be a document with columns. "
1670
- f"Be sure to read each column from top to bottom, then move to the next column. "
1671
- f"Extract all article titles, headings, and body text.\n"
1672
- )
1673
-
1674
- # Simple output instructions for default cases
1675
- output_section = (
1676
- f"Create a structured JSON response with the following fields:\n"
1677
- f"- file_name: The document's name\n"
1678
- f"- topics: An array of topics covered in the document\n"
1679
- f"- languages: An array of languages used in the document\n"
1680
- f"- ocr_contents: A dictionary with the document's contents, with the focus on complete text extraction\n"
1681
- )
1682
-
1683
  # Add custom prompt if provided
1684
  custom_section = ""
1685
  if custom_prompt:
1686
- # Process custom prompt to extract just the instructions part if available
1687
- if "USER INSTRUCTIONS:" in custom_prompt:
1688
- instructions_part = custom_prompt.split("USER INSTRUCTIONS:")[1].strip()
1689
- custom_section = f"\n\nUser-provided instructions: {instructions_part}\n"
1690
- elif "INSTRUCTIONS:" in custom_prompt:
1691
- instructions_part = custom_prompt.split("INSTRUCTIONS:")[1].strip()
1692
- custom_section = f"\n\nUser-provided instructions: {instructions_part}\n"
1693
- else:
1694
- # Strip custom prompt to essentials
1695
- stripped_prompt = custom_prompt.replace("This is a", "").replace("It appears to be a", "")
1696
- custom_section = f"\n\nUser-provided instructions: {stripped_prompt}\n"
1697
 
1698
- # Combine all sections into complete prompt
1699
- return generic_section + specific_section + output_section + custom_section
1700
 
1701
  def _extract_structured_data_text_only(self, ocr_markdown, filename, custom_prompt=None):
1702
  """
1703
  Extract structured data using text-only model with detailed historical context prompting
1704
- and improved error handling
1705
  """
1706
  logger = logging.getLogger("text_processor")
1707
  start_time = time.time()
@@ -1710,10 +1566,68 @@ class StructuredOCR:
1710
  # Fast path: Skip for minimal OCR text
1711
  if not ocr_markdown or len(ocr_markdown.strip()) < 50:
1712
  logger.info("Minimal OCR text - returning basic result")
 
 
1713
  return {
1714
  "file_name": filename,
1715
  "topics": ["Document"],
1716
- "languages": ["English"],
1717
  "ocr_contents": {
1718
  "raw_text": ocr_markdown if ocr_markdown else "No text could be extracted"
1719
  },
@@ -1734,10 +1648,6 @@ class StructuredOCR:
1734
  "processing_method": "test_mode"
1735
  }
1736
 
1737
- # Detect document type and build enhanced prompt
1738
- doc_type = self._detect_document_type(custom_prompt, ocr_markdown)
1739
- logger.info(f"Detected document type: {doc_type}")
1740
-
1741
  # If OCR text is very large, truncate it to avoid API limits
1742
  truncated_text = ocr_markdown
1743
  if len(ocr_markdown) > 25000:
@@ -1745,8 +1655,25 @@ class StructuredOCR:
1745
  truncated_text = ocr_markdown[:15000] + "\n...[content truncated]...\n" + ocr_markdown[-5000:]
1746
  logger.info(f"OCR text truncated from {len(ocr_markdown)} to {len(truncated_text)} chars")
1747
 
1748
- # Build the prompt with truncated text if needed
1749
- enhanced_prompt = self._build_enhanced_prompt(doc_type, truncated_text, custom_prompt)
 
 
1750
 
1751
  # Use enhanced prompt with text-only model - with retry logic
1752
  max_retries = 2
@@ -1784,40 +1711,14 @@ class StructuredOCR:
1784
  if 'languages' in result:
1785
  result['languages'] = [str(lang) for lang in result.get('languages', [])]
1786
 
1787
- # Add processing metadata
1788
  result['processing_method'] = 'text_model'
1789
- result['document_type'] = doc_type
1790
  result['model_used'] = TEXT_MODEL
1791
  result['processing_time'] = time.time() - start_time
1792
 
1793
  # Flag when custom prompt has been successfully applied
1794
  if custom_prompt:
1795
  result['custom_prompt_applied'] = 'text_model'
1796
-
1797
- # Attempt to detect document type from custom prompt
1798
- if "DOCUMENT TYPE:" in custom_prompt:
1799
- doc_type_line = custom_prompt.split("\n")[0]
1800
- if "DOCUMENT TYPE:" in doc_type_line:
1801
- custom_doc_type = doc_type_line.split("DOCUMENT TYPE:")[1].strip().lower()
1802
- result['detected_document_type'] = custom_doc_type
1803
- # Keyword-based detection as fallback
1804
- elif any(keyword in custom_prompt.lower() for keyword in ["newspaper", "column", "article", "magazine"]):
1805
- result['detected_document_type'] = "newspaper"
1806
- elif any(keyword in custom_prompt.lower() for keyword in ["letter", "correspondence", "handwritten"]):
1807
- result['detected_document_type'] = "letter"
1808
- elif any(keyword in custom_prompt.lower() for keyword in ["book", "publication"]):
1809
- result['detected_document_type'] = "book"
1810
- elif any(keyword in custom_prompt.lower() for keyword in ["form", "certificate", "legal"]):
1811
- result['detected_document_type'] = "form"
1812
- elif any(keyword in custom_prompt.lower() for keyword in ["recipe", "ingredients"]):
1813
- result['detected_document_type'] = "recipe"
1814
- elif "this is a" in custom_prompt.lower():
1815
- # Extract document type from "This is a [type]" format
1816
- this_is_parts = custom_prompt.lower().split("this is a ")
1817
- if len(this_is_parts) > 1:
1818
- extracted_type = this_is_parts[1].split(".")[0].strip()
1819
- if extracted_type:
1820
- result['detected_document_type'] = extracted_type
1821
 
1822
  # Add raw text for reference if not already present
1823
  if 'ocr_contents' in result and 'raw_text' not in result['ocr_contents']:
@@ -1880,18 +1781,7 @@ class StructuredOCR:
1880
  "processing_time": time.time() - start_time
1881
  }
1882
 
1883
- # Try to extract some basic metadata even without AI
1884
- if ocr_markdown:
1885
- # Simple content analysis
1886
- text_sample = ocr_markdown[:5000].lower()
1887
-
1888
- # Try to detect language
1889
- if "dear" in text_sample and any(word in text_sample for word in ["sincerely", "regards", "truly"]):
1890
- result["topics"].append("Letter")
1891
- elif any(word in text_sample for word in ["recipe", "ingredients", "instructions", "cook", "bake"]):
1892
- result["topics"].append("Recipe")
1893
- elif any(word in text_sample for word in ["article", "report", "study", "analysis"]):
1894
- result["topics"].append("Article")
1895
 
1896
  except Exception as inner_e:
1897
  logger.error(f"Error creating basic result: {str(inner_e)}")
@@ -1919,4 +1809,4 @@ if __name__ == "__main__":
1919
  processor = StructuredOCR()
1920
  result = processor.process_file(file_path)
1921
 
1922
- print(json.dumps(result, indent=2))
 
1
+ # Standard library imports
2
  import os
3
  import sys
4
  import time
5
  import random
 
 
6
  import json
7
  import base64
8
  import logging
9
+ from enum import Enum
10
+ from pathlib import Path
11
  from functools import lru_cache
12
  from typing import Optional, Dict, Any, List, Union, Tuple
13
 
14
+ # Configure logging
15
+ logging.basicConfig(level=logging.INFO,
16
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Third-party imports
20
+ from pydantic import BaseModel
21
+
22
  # Try to import pycountry, provide fallback if not available
23
  try:
24
  import pycountry
25
  PYCOUNTRY_AVAILABLE = True
26
  except ImportError:
27
  PYCOUNTRY_AVAILABLE = False
28
+ logger.warning("pycountry module not available - using language code fallback")
 
 
29
 
30
  # Try to import Mistral AI, provide fallback if not available
31
  try:
 
35
  MISTRAL_AVAILABLE = True
36
  except ImportError:
37
  MISTRAL_AVAILABLE = False
38
+ logger.warning("mistralai module not available - OCR functionality will be limited")
 
 
39
 
40
  # Import utilities for OCR processing
41
  try:
 
219
  if file_type is None:
220
  suffix = file_path.suffix.lower()
221
  file_type = "pdf" if suffix == ".pdf" else "image"
222
+
223
+ # Check for handwritten document by filename
224
+ filename_lower = file_path.name.lower()
225
+ if "handwritten" in filename_lower or "manuscript" in filename_lower or "letter" in filename_lower:
226
+ logger.info(f"Detected likely handwritten document from filename: {file_path.name}")
227
+ # This will be used during processing to apply handwritten-specific handling
228
 
229
  # Get file size if not provided
230
  if file_size_mb is None and file_path.exists():
 
446
  # Convert only the selected pages to minimize memory usage
447
  selected_images = []
448
  combined_text = []
449
+ detected_languages = set() # Track detected languages across all pages
450
 
451
  # Process pages in larger batches for better efficiency
452
  batch_size = 5 # Process 5 pages at a time for better throughput
 
482
  # Add page text to combined text without obvious page markers
483
  page_text = page_result['ocr_contents']['raw_text']
484
  combined_text.append(f"{page_text}")
485
+
486
+ # Collect detected languages from each page
487
+ if 'languages' in page_result:
488
+ for lang in page_result['languages']:
489
+ detected_languages.add(lang)
490
  except Exception as page_e:
491
  logger.warning(f"Error processing page {page_num}: {str(page_e)}")
492
  # Clean up temp file
 
524
  # Add flag to indicate custom prompt was applied
525
  result['custom_prompt_applied'] = 'text_only'
526
 
527
+ # Simplified approach - no document type detection
 
 
528
 
529
  except Exception as e:
530
  logger.warning(f"Custom prompt processing failed: {str(e)}. Using standard processing.")
 
538
  if 'ocr_contents' in result:
539
  result['ocr_contents']['raw_text'] = all_text
540
 
541
+ # Merge detected languages if available
542
+ if detected_languages:
543
+ result['languages'] = list(detected_languages)
544
+
545
  # Add PDF metadata
546
  result['file_name'] = file_path.name
547
  result['pdf_processing_method'] = 'pdf2image_optimized'
 
709
  limited_pages = True
710
  logger.info(f"Processing {len(pages_to_process)} pages out of {total_pages} total")
711
 
712
+ # Directly extract any language information from the OCR response
713
+ detected_languages = set()
714
+
715
+ # Check if the response has a 'languages' attribute in any form
716
+ # First check direct attributes on the response object
717
+ if hasattr(pdf_response, 'languages') and pdf_response.languages:
718
+ for lang in pdf_response.languages:
719
+ detected_languages.add(str(lang))
720
+ logger.info(f"Found language in OCR response: {lang}")
721
+
722
+ # Then check if it's in the response as a dictionary format
723
+ elif hasattr(pdf_response, '__dict__'):
724
+ response_dict = pdf_response.__dict__
725
+ if 'languages' in response_dict and response_dict['languages']:
726
+ for lang in response_dict['languages']:
727
+ detected_languages.add(str(lang))
728
+ logger.info(f"Found language in OCR response dict: {lang}")
729
+
730
  # Calculate confidence score if available
731
  try:
732
  confidence_values = [page.confidence for page in pages_to_process if hasattr(page, 'confidence')]
 
749
  if page_markdown.strip():
750
  all_markdown.append(f"{page_markdown}")
751
 
752
+ # Collect language information from individual pages if available
753
+ if hasattr(page, 'languages') and page.languages:
754
+ for lang in page.languages:
755
+ detected_languages.add(str(lang))
756
+ logger.info(f"Found language in page {page_num}: {lang}")
757
+
758
  # Join all pages with separation
759
  combined_markdown = "\n\n".join(all_markdown)
760
 
 
788
  combined_markdown, file_path.name, custom_prompt
789
  )
790
 
791
+ # If we have detected languages directly from the OCR model, use them
792
+ if detected_languages:
793
+ logger.info(f"Using languages detected by OCR model: {', '.join(detected_languages)}")
794
+ result['languages'] = list(detected_languages)
795
+ # Add flag to indicate source of language detection
796
+ result['language_detection_source'] = 'mistral-ocr-latest'
797
+
798
  # Add metadata about pages
799
  if limited_pages:
800
  result['limited_pages'] = {
 
956
  "confidence_score": 0.0
957
  }
958
 
959
+ # Check if this is likely a newspaper or handwritten document by filename
960
  is_likely_newspaper = False
961
+ is_likely_handwritten = False
962
+
963
  newspaper_keywords = ["newspaper", "gazette", "herald", "times", "journal",
964
  "chronicle", "post", "tribune", "news", "press", "gender"]
965
+
966
+ handwritten_keywords = ["handwritten", "manuscript", "letter", "correspondence", "journal", "diary"]
967
 
968
+ # Check filename for document type indicators
969
  filename_lower = file_path.name.lower()
970
+
971
+ # First check for handwritten documents
972
+ for keyword in handwritten_keywords:
973
  if keyword in filename_lower:
974
+ is_likely_handwritten = True
975
+ logger.info(f"Likely handwritten document detected from filename: {file_path.name}")
976
+ # Add handwritten-specific processing hint to custom_prompt if not already present
977
  if custom_prompt:
978
+ if "handwritten" not in custom_prompt.lower():
979
+ custom_prompt = custom_prompt + " This appears to be a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations."
980
  else:
981
+ custom_prompt = "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations."
982
  break
983
+
984
+ # Then check for newspaper if not handwritten
985
+ if not is_likely_handwritten:
986
+ for keyword in newspaper_keywords:
987
+ if keyword in filename_lower:
988
+ is_likely_newspaper = True
989
+ logger.info(f"Likely newspaper document detected from filename: {file_path.name}")
990
+ # Add newspaper-specific processing hint to custom_prompt if not already present
991
+ if custom_prompt:
992
+ if "column" not in custom_prompt.lower() and "newspaper" not in custom_prompt.lower():
993
+ custom_prompt = custom_prompt + " This appears to be a newspaper or document with columns. Please extract all text content from each column."
994
+ else:
995
+ custom_prompt = "This appears to be a newspaper or document with columns. Please extract all text content from each column, maintaining proper reading order."
996
+ break
997
 
998
  try:
999
  # Check file size
 
1066
  logger.info(f"Resized image to {new_size_mb:.2f} MB")
1067
  except ImportError:
1068
  logger.warning("PIL not available for resizing. Using original image.")
1069
+ # Use enhanced encoder with proper MIME type detection
1070
+ from ocr_utils import encode_image_for_api
1071
+ base64_data_url = encode_image_for_api(file_path)
1072
  except Exception as e:
1073
  logger.warning(f"Image resize failed: {str(e)}. Using original image.")
1074
+ # Use enhanced encoder with proper MIME type detection
1075
+ from ocr_utils import encode_image_for_api
1076
+ base64_data_url = encode_image_for_api(file_path)
1077
  else:
1078
+ # For smaller images, use as-is with proper MIME type
1079
+ from ocr_utils import encode_image_for_api
1080
+ base64_data_url = encode_image_for_api(file_path)
1081
  except Exception as e:
1082
  # Fallback to original image if any preprocessing fails
1083
  logger.warning(f"Image preprocessing failed: {str(e)}. Using original image.")
1084
+ # Use enhanced encoder with proper MIME type detection
1085
+ from ocr_utils import encode_image_for_api
1086
+ base64_data_url = encode_image_for_api(file_path)
1087
 
1088
  # Process the image with OCR
1089
  logger.info(f"Processing image with OCR using {OCR_MODEL}")
 
1175
  # Get the OCR markdown from the first page
1176
  image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""
1177
 
1178
+ # Check if the OCR response has images
1179
+ has_images = hasattr(image_response, 'pages') and image_response.pages and hasattr(image_response.pages[0], 'images') and image_response.pages[0].images
1180
+
1181
+ # Check for language information directly from the OCR model
1182
+ detected_languages = set()
1183
+
1184
+ # Check if the response has a 'languages' attribute in any form
1185
+ # First check direct attributes on the response object
1186
+ if hasattr(image_response, 'languages') and image_response.languages:
1187
+ for lang in image_response.languages:
1188
+ detected_languages.add(str(lang))
1189
+ logger.info(f"Found language in OCR response: {lang}")
1190
+
1191
+ # Then check if it's in the response as a dictionary format
1192
+ elif hasattr(image_response, '__dict__'):
1193
+ response_dict = image_response.__dict__
1194
+ if 'languages' in response_dict and response_dict['languages']:
1195
+ for lang in response_dict['languages']:
1196
+ detected_languages.add(str(lang))
1197
+ logger.info(f"Found language in OCR response dict: {lang}")
1198
+
1199
+ # Check for languages in individual pages
1200
+ if hasattr(image_response, 'pages') and image_response.pages:
1201
+ for page in image_response.pages:
1202
+ if hasattr(page, 'languages') and page.languages:
1203
+ for lang in page.languages:
1204
+ detected_languages.add(str(lang))
1205
+ logger.info(f"Found language in page: {lang}")
1206
+
1207
  # Optimize: Skip vision model step if ocr_markdown is very small or empty
1208
  # BUT make an exception for newspapers or if custom_prompt is provided
1209
+ # OR if the image has visual content worth preserving
1210
+ if (not is_likely_newspaper and not custom_prompt and not has_images) and (not image_ocr_markdown or len(image_ocr_markdown) < 50):
1211
+ logger.warning("OCR produced minimal text with no images. Returning basic result.")
1212
  return {
1213
  "file_name": file_path.name,
1214
  "topics": ["Document"],
 
1216
  "ocr_contents": {
1217
  "raw_text": image_ocr_markdown if image_ocr_markdown else "No text could be extracted from the image."
1218
  },
1219
+ "processing_note": "OCR produced minimal text content",
1220
+ # Include raw response data for images
1221
+ "raw_response_data": serialize_ocr_response(image_response)
1222
  }
1223
 
1224
  # For newspapers with little text in OCR, set a more explicit prompt
 
1228
  custom_prompt = "This is a newspaper or document with columns. The OCR may not have captured all text. Please examine the image carefully and extract ALL text content visible in the document, reading each column from top to bottom."
1229
  elif "extract all text" not in custom_prompt.lower():
1230
  custom_prompt += " Please examine the image carefully and extract ALL text content visible in the document."
1231
+
1232
+ # For images with minimal text but visual content, enhance the prompt
1233
+ elif has_images and (not image_ocr_markdown or len(image_ocr_markdown) < 100):
1234
+ logger.info("Document with images but minimal text detected. Using enhanced prompt for mixed media.")
1235
+ if not custom_prompt:
1236
+ custom_prompt = "This is a mixed media document with both text and important visual elements. Please carefully describe the image content and extract all visible text, preserving the relationship between text and visuals."
1237
+ elif "visual" not in custom_prompt.lower() and "image" not in custom_prompt.lower():
1238
+ custom_prompt += " The document contains important visual elements that should be described along with the text content."
1239
 
1240
  # Extract structured data using the appropriate model, with a single API call
1241
  if use_vision:
 
1245
  logger.info(f"Using text-only model: {TEXT_MODEL}")
1246
  result = self._extract_structured_data_text_only(image_ocr_markdown, file_path.name, custom_prompt)
1247
 
1248
+ # If we have detected languages directly from the OCR model, use them
1249
+ if detected_languages:
1250
+ logger.info(f"Using languages detected by OCR model: {', '.join(detected_languages)}")
1251
+ result['languages'] = list(detected_languages)
1252
+ # Add flag to indicate source of language detection
1253
+ result['language_detection_source'] = 'mistral-ocr-latest'
1254
+
1255
  # Store the serialized OCR response for image rendering (for compatibility with original version)
1256
  # Don't store raw_response directly as it's not JSON serializable
1257
  serialized_response = serialize_ocr_response(image_response)
 
1259
 
1260
  # Store key parts of the OCR response for image rendering
1261
  # With serialized format that can be stored in JSON
 
1262
  result['has_images'] = has_images
1263
 
1264
  if has_images:
 
1371
  logger.info("Test mode or no API key, using text-only processing")
1372
  return self._extract_structured_data_text_only(ocr_markdown, filename)
1373
 
 
 
1374
  # Use only the first part of OCR text to keep prompts small and processing fast
1375
  if len(ocr_markdown) > 1000:
1376
  truncated_ocr = ocr_markdown[:1000]
 
1378
  else:
1379
  truncated_ocr = ocr_markdown
1380
 
1381
+ # Build a comprehensive prompt with OCR text and detailed instructions for language detection and image handling
1382
+ enhanced_prompt = f"This is a document's OCR text:\n<BEGIN_OCR>\n{truncated_ocr}\n<END_OCR>\n\n"
1383
+
1384
+ # Add custom prompt if provided
1385
+ if custom_prompt:
1386
+ enhanced_prompt += f"User instructions: {custom_prompt}\n\n"
1387
+
1388
+ # Add comprehensive extraction instructions with language detection guidance
1389
+ enhanced_prompt += "Extract all text content accurately from this document, including any text visible in the image that may not have been captured by OCR.\n\n"
1390
+ enhanced_prompt += "IMPORTANT: Precisely identify and list ALL languages present in the document separately. Look closely for multiple languages that might appear together.\n"
1391
+ enhanced_prompt += "For language detection, examine these specific indicators:\n"
1392
+ enhanced_prompt += "- Portuguese: accents (ã, õ, á, é, ê, ó, ç), words like 'e', 'o', 'de', 'da', 'do', 'em', 'para', 'que', 'não', 'com'\n"
1393
+ enhanced_prompt += "- Spanish: ñ, inverted punctuation (¿, ¡), accents (á, é, í, ó, ú), words like 'el', 'la', 'los', 'las', 'y', 'en', 'por', 'que', 'con'\n"
1394
+ enhanced_prompt += "- French: accents (é, è, ê, à, ç), words like 'le', 'la', 'les', 'et', 'en', 'de', 'du'\n"
1395
+ enhanced_prompt += "- German: umlauts (ä, ö, ü), sharp s (ß), words like 'und', 'der', 'die', 'das', 'in', 'mit'\n"
1396
+ enhanced_prompt += "- Italian: accents (à, è, é, ì, ò, ù), words like 'il', 'la', 'e', 'di', 'che', 'per', 'con'\n"
1397
+ enhanced_prompt += "- Chinese: hanzi characters (汉字), lack of spaces between words, markers like 的, 是, 了, 在, 和, 有\n"
1398
+ enhanced_prompt += "- Latin: words like 'et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed'\n\n"
1399
+ enhanced_prompt += "If the document contains multiple columns or sections, process each section independently and then combine them logically.\n"
1400
+ enhanced_prompt += "Return ALL detected languages as separate entries in the languages array, never combine them."
1401
 
1402
  # Measure API call time for optimization feedback
1403
  start_time = time.time()
 
1406
  # Use a fixed, shorter timeout for single-page documents
1407
  timeout_ms = 45000 # 45 seconds is optimal for most single-page documents
1408
 
1409
+ logger.info(f"Calling vision model with {timeout_ms}ms timeout")
1410
  chat_response = self.client.chat.parse(
1411
  model=VISION_MODEL,
1412
  messages=[
 
1424
  )
1425
 
1426
  api_time = time.time() - start_time
1427
+ logger.info(f"Vision model completed in {api_time:.2f}s")
1428
 
1429
  except Exception as e:
1430
  # If there's an error with the enhanced prompt, try progressively simpler approaches
 
1504
  if 'languages' in result:
1505
  result['languages'] = [str(lang) for lang in result.get('languages', [])]
1506
 
1507
+ # Add simplified metadata about processing
1508
  result['processing_info'] = {
1509
  'method': 'vision_model',
 
1510
  'ocr_text_length': len(ocr_markdown),
1511
  'api_response_time': time.time() - start_time
1512
  }
1513
 
1514
+ # Note if custom prompt was applied
1515
  if custom_prompt:
1516
  result['custom_prompt_applied'] = 'vision_model'
 
 
 
1517
 
1518
  # Add confidence score if not present
1519
  if 'confidence_score' not in result:
 
1526
 
1527
  return result
1528
 
1529
+ # We've removed document type detection entirely for simplicity
 
 
1530
 
1531
+ # Create a prompt with enhanced language detection instructions
 
 
1532
  generic_section = (
1533
+ f"You are an OCR specialist processing historical documents. "
1534
+ f"Focus on accurately extracting text content while preserving structure and formatting. "
1535
+ f"Pay attention to any historical features and document characteristics.\n\n"
1536
+ f"IMPORTANT: Accurately identify the document's language(s). Look for language-specific characters, words, and phrases. "
1537
+ f"Specifically check for French (accents like é, è, ç, words like 'le', 'la', 'et', 'est'), German (umlauts, words like 'und', 'der', 'das'), "
1538
+ f"Latin, and other non-English languages. Carefully analyze the text before determining language.\n\n"
1539
+ f"Create a structured JSON response with the following fields:\n"
1540
+ f"- file_name: The document's name\n"
1541
+ f"- topics: An array of topics covered in the document\n"
1542
+ f"- languages: An array of languages used in the document (be precise and specific about language detection)\n"
1543
+ f"- ocr_contents: A comprehensive dictionary with the document's contents including:\n"
1544
+ f" * title: The main title or heading (if present)\n"
1545
+ f" * content: The main body content\n"
1546
+ f" * raw_text: The complete OCR text\n"
1547
  )
1548
 
 
 
1549
  # Add custom prompt if provided
1550
  custom_section = ""
1551
  if custom_prompt:
1552
+ custom_section = f"\n\nUser-provided instructions: {custom_prompt}\n"
 
 
1553
 
1554
+ # Return the enhanced prompt
1555
+ return generic_section + custom_section
1556
 
1557
  def _extract_structured_data_text_only(self, ocr_markdown, filename, custom_prompt=None):
1558
  """
1559
  Extract structured data using text-only model with detailed historical context prompting
1560
+ and improved error handling with enhanced language detection
1561
  """
1562
  logger = logging.getLogger("text_processor")
1563
  start_time = time.time()
 
1566
  # Fast path: Skip for minimal OCR text
1567
  if not ocr_markdown or len(ocr_markdown.strip()) < 50:
1568
  logger.info("Minimal OCR text - returning basic result")
1569
+
1570
+ # Attempt comprehensive language detection even for minimal text
1571
+ detected_languages = []
1572
+
1573
+ # Simple language detection based on character frequency
1574
+ if ocr_markdown and len(ocr_markdown.strip()) > 10:
1575
+ # Define indicators for all supported languages
1576
+ language_indicators = {
1577
+ "Portuguese": {
1578
+ "chars": ['ã', 'õ', 'á', 'é', 'ê', 'í', 'ó', 'ú', 'ç'],
1579
+ "words": ['e', 'o', 'de', 'da', 'do', 'em', 'para', 'que', 'não', 'com']
1580
+ },
1581
+ "Spanish": {
1582
+ "chars": ['ñ', 'á', 'é', 'í', 'ó', 'ú', '¿', '¡'],
1583
+ "words": ['el', 'la', 'los', 'las', 'y', 'en', 'por', 'que', 'con', 'del']
1584
+ },
1585
+ "French": {
1586
+ "chars": ['é', 'è', 'ê', 'à', 'ç', 'ù', 'â', 'î', 'ô', 'û'],
1587
+ "words": ['le', 'la', 'les', 'et', 'en', 'de', 'du', 'des', 'un', 'une']
1588
+ },
1589
+ "German": {
1590
+ "chars": ['ä', 'ö', 'ü', 'ß'],
1591
+ "words": ['der', 'die', 'das', 'und', 'ist', 'von', 'mit', 'für', 'sich']
1592
+ },
1593
+ "Italian": {
1594
+ "chars": ['à', 'è', 'é', 'ì', 'ò', 'ù'],
1595
+ "words": ['il', 'la', 'e', 'di', 'che', 'per', 'con', 'sono', 'non']
1596
+ },
1597
+ "Latin": {
1598
+ "chars": [],
1599
+ "words": ['et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed', 'qui', 'quod']
1600
+ }
1601
+ }
1602
+
1603
+ words = ocr_markdown.lower().split()
1604
+
1605
+ # Check for indicators of each language
1606
+ for language, indicators in language_indicators.items():
1607
+ chars = indicators["chars"]
1608
+ lang_words = indicators["words"]
1609
+
1610
+ has_chars = any(char in ocr_markdown for char in chars) if chars else False
1611
+ word_count = sum(1 for word in words if word in lang_words)
1612
+
1613
+ # Add language if strong enough indicators are present
1614
+ if has_chars or word_count >= 2:
1615
+ detected_languages.append(language)
1616
+
1617
+ # Check for English separately
1618
+ english_words = ['the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for', 'it']
1619
+ english_count = sum(1 for word in words if word in english_words)
1620
+ if english_count >= 2:
1621
+ detected_languages.append("English")
1622
+
1623
+ # If no languages detected, default to English
1624
+ if not detected_languages:
1625
+ detected_languages = ["English"]
1626
+
1627
  return {
1628
  "file_name": filename,
1629
  "topics": ["Document"],
1630
+ "languages": detected_languages,
1631
  "ocr_contents": {
1632
  "raw_text": ocr_markdown if ocr_markdown else "No text could be extracted"
1633
  },
 
1648
  "processing_method": "test_mode"
1649
  }
1650
 
 
 
1651
  # If OCR text is very large, truncate it to avoid API limits
1652
  truncated_text = ocr_markdown
1653
  if len(ocr_markdown) > 25000:
 
1655
  truncated_text = ocr_markdown[:15000] + "\n...[content truncated]...\n" + ocr_markdown[-5000:]
1656
  logger.info(f"OCR text truncated from {len(ocr_markdown)} to {len(truncated_text)} chars")
1657
 
1658
+ # Build a prompt with enhanced language detection instructions
1659
+ enhanced_prompt = f"This is a document's OCR text:\n<BEGIN_OCR>\n{truncated_text}\n<END_OCR>\n\n"
1660
+
1661
+ # Add custom prompt if provided
1662
+ if custom_prompt:
1663
+ enhanced_prompt += f"User instructions: {custom_prompt}\n\n"
1664
+
1665
+ # Add thorough extraction instructions with enhanced language detection and metadata requirements
1666
+ enhanced_prompt += "Extract all text content accurately from this document. Return structured data with the document's contents.\n\n"
1667
+ enhanced_prompt += "IMPORTANT: Precisely identify and list ALL languages present in the document separately. Look closely for multiple languages that might appear together.\n"
1668
+ enhanced_prompt += "For language detection, examine these specific indicators:\n"
1669
+ enhanced_prompt += "- French: accents (é, è, ê, à, ç), words like 'le', 'la', 'les', 'et', 'en', 'de', 'du'\n"
1670
+ enhanced_prompt += "- German: umlauts (ä, ö, ü), sharp s (ß), words like 'und', 'der', 'die', 'das', 'in', 'mit'\n"
1671
+ enhanced_prompt += "- Spanish: ñ, inverted punctuation (¿, ¡), accents (á, é, í, ó, ú), words like 'el', 'la', 'los', 'las', 'y', 'en'\n"
1672
+ enhanced_prompt += "- Italian: words like 'il', 'la', 'e', 'di', 'che', 'per', 'con'\n"
1673
+ enhanced_prompt += "- Chinese: hanzi characters (汉字), lack of spaces between words, markers like 的, 是, 了, 在, 和, 有\n"
1674
+ enhanced_prompt += "- Latin: words like 'et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed'\n"
1675
+ enhanced_prompt += "Do NOT classify text as English unless you can positively confirm it contains specifically English words and phrases.\n\n"
1676
+ enhanced_prompt += "Return ALL detected languages as separate entries in the languages array. If multiple languages are present, list them ALL separately."
1677
 
1678
  # Use enhanced prompt with text-only model - with retry logic
1679
  max_retries = 2
 
1711
  if 'languages' in result:
1712
  result['languages'] = [str(lang) for lang in result.get('languages', [])]
1713
 
1714
+ # Add simplified processing metadata
1715
  result['processing_method'] = 'text_model'
 
1716
  result['model_used'] = TEXT_MODEL
1717
  result['processing_time'] = time.time() - start_time
1718
 
1719
  # Flag when custom prompt has been successfully applied
1720
  if custom_prompt:
1721
  result['custom_prompt_applied'] = 'text_model'
 
 
 
1722
 
1723
  # Add raw text for reference if not already present
1724
  if 'ocr_contents' in result and 'raw_text' not in result['ocr_contents']:
 
1781
  "processing_time": time.time() - start_time
1782
  }
1783
 
1784
+ # No topic detection to avoid issue with document misclassification
 
 
1785
 
1786
  except Exception as inner_e:
1787
  logger.error(f"Error creating basic result: {str(inner_e)}")
 
1809
  processor = StructuredOCR()
1810
  result = processor.process_file(file_path)
1811
 
1812
+ print(json.dumps(result, indent=2))
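
Note (illustrative, not part of this commit): the structured_ocr.py changes above consolidate per-page language detections returned by mistral-ocr-latest and fall back to a stop-word heuristic when the model reports nothing. A minimal sketch of that consolidation logic, assuming each page result is a dict that may carry a 'languages' list plus an 'ocr_contents.raw_text' string; the helper name consolidate_languages is hypothetical:

def consolidate_languages(page_results):
    """Merge language detections across pages, mirroring the PDF path above."""
    detected = set()
    for page in page_results:
        # Each page result may expose a 'languages' list from mistral-ocr-latest
        for lang in page.get('languages', []):
            detected.add(str(lang))

    if detected:
        # Languages came straight from the OCR model
        return {'languages': sorted(detected),
                'language_detection_source': 'mistral-ocr-latest'}

    # Fallback: crude stop-word counting over the combined raw text
    text = " ".join(p.get('ocr_contents', {}).get('raw_text', '')
                    for p in page_results).lower()
    words = set(text.split())
    stopwords = {
        'English': {'the', 'and', 'of', 'to', 'in'},
        'French':  {'le', 'la', 'les', 'et', 'des'},
        'German':  {'und', 'der', 'die', 'das', 'mit'},
    }
    fallback = [lang for lang, markers in stopwords.items() if len(words & markers) >= 2]
    return {'languages': fallback or ['English'],
            'language_detection_source': 'heuristic'}

In the diff itself the merged set only overwrites result['languages'] when the OCR response actually exposed language data, which preserves the existing prompt-based detection as the fallback path.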
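Also illustrative rather than part of the commit: the image branch above now calls ocr_utils.encode_image_for_api instead of hard-coding a data:image/jpeg prefix. That helper lives in ocr_utils.py and is not shown in this diff; the body below is an assumption about what "proper MIME type detection" means here (only the function name comes from the diff):

import base64
import mimetypes
from pathlib import Path

def encode_image_for_api(file_path) -> str:
    """Return a base64 data URL whose MIME type is guessed from the file extension."""
    path = Path(file_path)
    mime_type, _ = mimetypes.guess_type(path.name)
    mime_type = mime_type or "image/jpeg"  # conservative fallback for unknown extensions
    encoded = base64.b64encode(path.read_bytes()).decode()
    return f"data:{mime_type};base64,{encoded}"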
ui/layout.py CHANGED
@@ -1,217 +1,339 @@
1
  import streamlit as st
2
 
3
  def load_css():
4
- """Load custom CSS for the application"""
5
  st.markdown("""
6
  <style>
7
- /* Global styles */
8
- body {
9
- font-family: 'Source Sans Pro', sans-serif;
10
- color: #333;
11
  }
12
 
13
- /* Header styles */
 
 
14
  h1, h2, h3, h4, h5, h6 {
15
- font-family: 'Georgia', serif;
16
  font-weight: 600;
17
- color: #1E3A8A;
 
 
18
  }
19
 
20
- /* Processing status container */
21
- .processing-status-container {
22
- padding: 10px 15px;
23
- border-left: 4px solid #1E88E5;
24
- background-color: #E3F2FD;
25
- border-radius: 0 4px 4px 0;
26
- margin: 10px 0;
27
- font-size: 14px;
28
  }
29
 
30
- /* Previous results styling */
31
- .previous-results-container {
32
- margin-top: 20px;
33
  }
34
 
35
- .result-card {
36
- background-color: #f8f9fa;
37
- border-radius: 8px;
38
- padding: 15px;
39
- margin-bottom: 15px;
40
- border: 1px solid #e0e0e0;
41
- transition: all 0.2s ease;
42
  }
43
 
44
- .result-card:hover {
45
- box-shadow: 0 4px 8px rgba(0,0,0,0.1);
46
- border-color: #c0c0c0;
47
  }
48
 
49
- .result-header {
50
- display: flex;
51
- justify-content: space-between;
52
- margin-bottom: 10px;
 
 
53
  }
54
 
55
- .result-filename {
56
- font-weight: bold;
57
- font-size: 16px;
 
 
58
  }
59
 
60
- .result-date {
61
- color: #666;
62
- font-size: 14px;
 
 
63
  }
64
 
65
- .result-metadata {
66
- margin-top: 10px;
67
- font-size: 14px;
 
 
68
  }
69
 
70
- .result-tag {
 
 
71
  margin-bottom: 5px;
72
- color: #555;
73
  }
74
 
75
- .result-action-button {
76
- margin-top: 10px;
77
- text-align: right;
78
  }
79
 
80
- .selected-result-container {
81
- margin-top: 30px;
82
- padding: 20px;
83
- background-color: #f0f2f6;
84
- border-radius: 8px;
85
- border: 1px solid #d0d7de;
86
  }
87
 
88
- .selected-result-title {
89
- font-size: 18px;
90
- font-weight: bold;
91
- color: #1E3A8A;
 
 
 
 
 
 
92
  }
93
 
94
- /* Raw text editor styling */
 
 
 
 
 
95
  .stTextArea textarea {
96
- font-family: 'Courier New', monospace;
97
- font-size: 14px;
98
- line-height: 1.5;
 
99
  }
100
 
101
- /* Image and text side-by-side styling */
102
- .image-text-container {
103
- display: flex;
104
- gap: 20px;
105
- margin-bottom: 20px;
 
 
 
 
 
 
106
  }
107
-
108
- .image-container {
109
- flex: 1;
 
 
 
 
110
  }
111
 
112
- .text-container {
113
- flex: 1;
 
 
 
 
 
 
 
114
  }
115
 
116
- /* Sidebar styling */
117
- .sidebar .stRadio > div {
118
- flex-direction: row;
119
  }
120
 
121
- .sidebar .stRadio label {
122
- margin-right: 10px;
 
123
  }
124
 
125
- /* Optimize spacing in sidebar */
126
  .sidebar .block-container {
127
- padding-top: 0;
128
  }
129
 
130
- .sidebar [data-testid="stVerticalBlock"] {
131
- gap: 0;
132
- }
133
-
134
- /* Button styling */
135
- .stButton > button {
136
- border-radius: 4px;
137
- font-weight: 600;
138
  }
139
 
140
- /* File uploader styling */
141
- .stFileUploader > section > div {
142
- min-height: 100px;
143
  }
144
 
145
- /* Reset vertical text in file uploader */
146
- .stFileUploader p,
147
- .stFileUploader span,
148
- .stFileUploader div p,
149
- .stFileUploader div span,
150
- .stFileUploader label p,
151
- .stFileUploader label span,
152
- .stFileUploader div[data-testid="stFileUploadDropzone"] p,
153
- .stFileUploader div[data-testid="stFileUploadDropzone"] span {
154
- writing-mode: horizontal-tb !important;
155
  }
156
 
157
- /* Metadata styling */
158
- .metadata-card {
159
- background-color: #f8f9fa;
160
- border-radius: 8px;
161
- padding: 15px;
162
- margin-bottom: 20px;
163
- border: 1px solid #e0e0e0;
164
  }
165
 
166
- /* Document content styling */
167
- .document-content {
168
- margin-top: 10px;
 
 
 
169
  }
170
 
171
- /* Tab styling */
172
- .stTabs [data-baseweb="tab-list"] {
173
- gap: 8px;
 
174
  }
175
 
176
- .stTabs [data-baseweb="tab"] {
177
- padding: 8px 16px;
178
- border-radius: 4px 4px 0 0;
 
 
179
  }
180
 
181
- /* Success message styling */
182
- .stSuccess {
183
- background-color: #D4EDDA;
184
- color: #155724;
185
- padding: 10px;
186
- border-radius: 4px;
187
- border-left: 5px solid #155724;
188
  }
189
 
190
- /* Error message styling */
191
- .stError {
192
- background-color: #F8D7DA;
193
- color: #721C24;
194
- padding: 10px;
195
- border-radius: 4px;
196
- border-left: 5px solid #721C24;
197
  }
198
 
199
- /* Info message styling */
200
- .stInfo {
201
- background-color: #D1ECF1;
202
- color: #0C5460;
203
- padding: 10px;
204
- border-radius: 4px;
205
- border-left: 5px solid #0C5460;
206
  }
207
 
208
- /* Warning message styling */
209
- .stWarning {
210
- background-color: #FFF3CD;
211
- color: #856404;
212
- padding: 10px;
213
- border-radius: 4px;
214
- border-left: 5px solid #856404;
215
  }
216
  </style>
217
  """, unsafe_allow_html=True)
 
1
  import streamlit as st
2
 
3
  def load_css():
4
+ """Load custom CSS for the application - inspired by mistral-ocr implementations"""
5
  st.markdown("""
6
  <style>
7
+ /* Global styles - clean, modern approach with consistent line height */
8
+ :root {
9
+ --standard-line-height: 1.5;
 
10
  }
11
 
12
+ body {
13
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
14
+ color: #111827;
15
+ line-height: var(--standard-line-height);
16
+ }
17
+
18
+ /* Remove all container backgrounds that cause the white box issue */
19
+ div[data-testid="stMarkdownContainer"],
20
+ div[data-testid="stText"],
21
+ div.stMarkdown,
22
+ .stText > div:first-child,
23
+ .element-container > div,
24
+ div[data-testid="column"] > div > div > div {
25
+ background-color: transparent !important;
26
+ box-shadow: none !important;
27
+ border: none !important;
28
+ border-radius: 0 !important;
29
+ padding: 0 !important;
30
+ margin: 0 !important;
31
+ }
32
+
33
+ /* Base text styling with standardized line height */
34
+ div[data-testid="stMarkdownContainer"] > p {
35
+ margin: 0 0 0.3rem 0 !important;
36
+ padding: 0 !important;
37
+ font-size: 0.95rem !important;
38
+ line-height: var(--standard-line-height) !important;
39
+ }
40
+
41
+ /* Move content to top of columns with minimal padding */
42
+ [data-testid="column"] {
43
+ align-items: flex-start !important;
44
+ padding: 0 0.5rem !important;
45
+ gap: 0.5rem !important;
46
+ }
47
+
48
+ /* Clean minimal heading styles with better line height */
49
  h1, h2, h3, h4, h5, h6 {
50
+ font-family: 'Inter', system-ui, sans-serif;
51
  font-weight: 600;
52
+ color: #111827;
53
+ margin: 0.4rem 0 0.2rem 0 !important;
54
+ padding: 0 !important;
55
+ background-color: transparent !important;
56
+ line-height: 1.3 !important; /* Slightly increased for headings but still compact */
57
+ }
58
+
59
+ /* Simple section headers with subtle styling */
60
+ .block-container [data-testid="column"] h4 {
61
+ font-size: 0.95rem !important;
62
+ font-weight: 600 !important;
63
+ color: #374151 !important;
64
+ border-bottom: 1px solid #e5e7eb;
65
+ padding-bottom: 0.15rem !important;
66
+ margin-bottom: 0.25rem !important;
67
+ }
68
+
69
+ /* Reduce whitespace between elements */
70
+ .element-container {
71
+ margin-bottom: 0.2rem !important;
72
+ }
73
+
74
+ /* OCR text container with improved contrast and styling */
75
+ .ocr-text-container {
76
+ font-family: 'Inter', system-ui, sans-serif;
77
+ font-size: 0.95rem;
78
+ line-height: var(--standard-line-height); /* Consistent line height */
79
+ color: #111827;
80
+ margin-bottom: 0.4rem;
81
+ max-height: 600px;
82
+ overflow-y: auto;
83
+ background-color: transparent;
84
+ padding: 6px 10px;
85
+ border-radius: 4px;
86
+ border: 1px solid #e2e8f0;
87
  }
88
 
89
+ /* Custom scrollbar styling */
90
+ .ocr-text-container::-webkit-scrollbar {
91
+ width: 6px;
92
+ height: 6px;
 
 
 
 
93
  }
94
 
95
+ .ocr-text-container::-webkit-scrollbar-track {
96
+ background: #f1f1f1;
97
+ border-radius: 3px;
98
  }
99
 
100
+ .ocr-text-container::-webkit-scrollbar-thumb {
101
+ background: #c1c1c1;
102
+ border-radius: 3px;
 
 
 
 
103
  }
104
 
105
+ .ocr-text-container::-webkit-scrollbar-thumb:hover {
106
+ background: #a0a0a0;
 
107
  }
108
 
109
+ /* Styling for all expanders/accordions */
110
+ .st-expander,
111
+ details.streamlit-expanderHeader {
112
+ border: 1px solid #e5e7eb !important;
113
+ border-radius: 4px !important;
114
+ box-shadow: none !important;
115
+ background-color: transparent !important;
116
+ margin-bottom: 6px !important;
117
  }
118
 
119
+ .st-expanderHeader,
120
+ summary.streamlit-expanderHeader {
121
+ font-size: 0.95rem !important;
122
+ font-weight: 600 !important;
123
+ color: #374151 !important;
124
+ padding: 0.4rem 0.6rem !important;
125
+ background-color: rgba(241, 245, 249, 0.5) !important;
126
+ border-bottom: 1px solid #e5e7eb !important;
127
+ border-radius: 3px 3px 0 0 !important;
128
  }
129
 
130
+ .st-expanderContent,
131
+ details[open] > div:nth-child(2) {
132
+ border-top: none !important;
133
+ padding: 0.4rem 0.6rem !important;
134
+ background-color: transparent !important;
135
  }
136
 
137
+ /* Set expander text to have good contrast */
138
+ .st-expanderContent p,
139
+ .st-expanderContent li,
140
+ .st-expanderContent span {
141
+ color: #1f2937 !important;
142
  }
143
 
144
+ /* Streamlined OCR image display */
145
+ .ocr-image-container {
146
+ border: 1px solid #e2e8f0;
147
+ border-radius: 4px;
148
+ padding: 0;
149
+ background-color: transparent;
150
  margin-bottom: 5px;
 
151
  }
152
 
153
+ .ocr-image-container img {
154
+ border-radius: 4px;
155
+ width: 100%;
156
  }
157
 
158
+ /* Subtle document sections */
159
+ .document-section {
160
+ margin-bottom: 0.4rem !important;
 
 
 
161
  }
162
 
163
+ /* Compact tag styling */
164
+ .subject-tag {
165
+ display: inline-block;
166
+ padding: 0.1rem 0.4rem;
167
+ border-radius: 3px;
168
+ font-size: 0.7rem;
169
+ margin: 0 0.2rem 0.2rem 0;
170
+ background-color: #f3f4f6;
171
+ color: #374151;
172
+ border: 1px solid #e5e7eb;
173
  }
174
 
175
+ .tag-time-period { color: #1e40af; background-color: #eff6ff; border-color: #bfdbfe; }
176
+ .tag-language { color: #065f46; background-color: #ecfdf5; border-color: #a7f3d0; }
177
+ .tag-document-type { color: #5b21b6; background-color: #f5f3ff; border-color: #ddd6fe; }
178
+ .tag-subject { color: #166534; background-color: #f0fdf4; border-color: #bbf7d0; }
179
+
180
+ /* Clean text area */
181
  .stTextArea textarea {
182
+ font-family: 'Roboto Mono', monospace;
183
+ font-size: 0.9rem;
184
+ line-height: var(--standard-line-height); /* Consistent line height */
185
+ padding: 0.5rem;
186
  }
187
 
188
+ /* Button styling - fixed for text overflow issues */
189
+ .stButton > button {
190
+ border-radius: 4px;
191
+ font-weight: 400;
192
+ line-height: var(--standard-line-height);
193
+ padding: 0.4rem 0.75rem !important;
194
+ margin: 0;
195
+ min-width: 150px !important; /* Increased minimum width */
196
+ white-space: normal !important; /* Allow text to wrap if needed */
197
+ overflow: visible !important; /* Ensure text doesn't get cut off */
198
+ height: auto !important; /* Allow height to adjust as needed */
199
+ text-overflow: clip !important; /* Don't clip text */
200
+ display: inline-block !important; /* Better content handling */
201
+ text-align: center !important; /* Center text */
202
+ }
203
+
204
+ /* Fix button text alignment and prevent truncation */
205
+ .stButton > button > div,
206
+ .stButton > button span,
207
+ .stButton > button p {
208
+ display: inline-block !important;
209
+ align-items: center;
210
+ white-space: normal !important;
211
+ overflow: visible !important;
212
+ width: auto !important;
213
+ text-overflow: clip !important;
214
+ word-wrap: normal !important;
215
+ }
216
+
217
+ /* Fix for all action buttons in the application */
218
+ [data-testid="stHorizontalBlock"] button,
219
+ button[key="close_document_btn"],
220
+ button[key="process_document_btn"],
221
+ button[key="load_sample_btn"],
222
+ button[key="view_btn"],
223
+ .stDownloadButton button,
224
+ button[key*="copy_btn"],
225
+ button[key*="download_btn"],
226
+ button[key*="view_"] {
227
+ width: auto !important;
228
+ min-width: 150px !important;
229
+ max-width: none !important;
230
+ display: inline-block !important;
231
+ white-space: normal !important;
232
+ overflow: visible !important;
233
+ text-align: center !important;
234
+ text-overflow: clip !important;
235
+ word-break: normal !important;
236
+ padding: 0.4rem 0.75rem !important;
237
  }
238
+
239
+ /* Ensure text doesn't wrap awkwardly for buttons */
240
+ button span p {
241
+ margin: 0 !important;
242
+ padding: 0 !important;
243
+ white-space: normal !important;
244
+ overflow: visible !important;
245
  }
246
 
247
+ /* Extra button container fixes for all button types */
248
+ .stButton, .stDownloadButton, [data-testid="stDownloadButton"] {
249
+ width: auto !important;
250
+ min-width: 150px !important;
251
+ overflow: visible !important;
252
+ display: block !important;
253
+ background-color: white;
254
+ border: 1px solid #ddd;
255
+ box-shadow: none !important;
256
  }
257
 
258
+ /* Ensure consistent spacing in widgets */
259
+ .row-widget {
260
+ padding: 0.15rem 0 !important;
261
  }
262
 
263
+ /* Fix spacing in expanders */
264
+ .stExpander > .streamlit-expanderContent > div {
265
+ padding-top: 0.15rem !important;
266
  }
267
 
268
+ /* Optimized sidebar */
269
  .sidebar .block-container {
270
+ padding-top: 0.6rem;
271
  }
272
 
273
+ .sidebar .stRadio > div {
274
+ flex-direction: row;
 
 
 
 
 
 
275
  }
276
 
277
+ .sidebar .stRadio label {
278
+ margin-right: 0.75rem;
279
+ font-size: 0.9rem;
280
  }
281
 
282
+ /* Clean alert styles */
283
+ .stSuccess, .stError, .stInfo, .stWarning {
284
+ border-radius: 4px;
285
+ padding: 0.3rem 0.6rem;
286
+ margin: 0.2rem 0;
 
 
 
 
 
287
  }
288
 
289
+ /* Fix any remaining spacing issues */
290
+ div.element-container > div > div {
291
+ margin: 0 !important;
292
+ line-height: var(--standard-line-height); /* Ensure consistent line height */
 
 
 
293
  }
294
 
295
+ /* Fix column layouts for button containers */
296
+ [data-testid="column"] > div:has(.stButton) {
297
+ display: flex;
298
+ justify-content: flex-start;
299
+ align-items: center;
300
+ min-height: 38px; /* Match standard button height */
301
  }
302
 
303
+ /* Fix for tabs being cut off at the top of the page */
304
+ /* Main container adjustments to avoid header overlap */
305
+ .main .block-container {
306
+ padding-top: 3rem !important; /* Increased top padding to make room for Streamlit header */
307
  }
308
 
309
+ [data-testid="stTabs"] {
310
+ width: 100%;
311
+ overflow-x: visible !important;
312
+ position: relative;
313
+ z-index: 1; /* Ensure tabs are on the right layer */
314
  }
315
 
316
+ [data-testid="stTabs"] > div:first-child {
317
+ padding-left: 0.5rem;
318
+ padding-right: 0.5rem;
319
+ overflow-x: visible !important;
 
 
 
320
  }
321
 
322
+ [data-testid="stTabs"] [role="tab"] {
323
+ padding: 0.5rem 1rem;
324
+ min-width: fit-content;
325
+ white-space: nowrap;
 
 
 
326
  }
327
 
328
+ [data-testid="stTabs"] [role="tablist"] {
329
+ overflow-x: visible !important;
330
+ flex-wrap: nowrap;
331
+ margin-top: 1rem; /* Add a bit more space at the top */
 
 
 
332
  }
333
 
334
+ /* Fix header overlap issues */
335
+ header[data-testid="stHeader"] {
336
+ z-index: 999 !important; /* Keep header on top */
 
 
 
 
337
  }
338
  </style>
339
  """, unsafe_allow_html=True)
ui_components.py CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
2
  import os
3
  import io
4
  import base64
 
5
  from datetime import datetime
6
  from pathlib import Path
7
  import json
@@ -64,16 +65,16 @@ class ProgressReporter:
64
  def create_sidebar_options():
65
  """Create and return sidebar options"""
66
  with st.sidebar:
67
- st.title("OCR Settings")
68
 
69
  # Create a container for the sidebar options
70
  with st.container():
71
  # Model selection
72
- st.subheader("Model Selection")
73
  use_vision = st.toggle("Use Vision Model", value=True, help="Use vision model for better understanding of document structure")
74
 
75
  # Document type selection
76
- st.subheader("Document Type")
77
  doc_type = st.selectbox("Document Type", DOCUMENT_TYPES,
78
  help="Select the type of document you're processing for better results")
79
 
@@ -100,7 +101,7 @@ def create_sidebar_options():
100
  st.markdown("**Custom Processing Instructions**")
101
  custom_prompt = st.text_area("", value=custom_prompt,
102
  help="Customize the instructions for processing this document",
103
- height=100)
104
 
105
  # Image preprocessing options in an expandable section
106
  with st.expander("Image Preprocessing"):
@@ -131,8 +132,17 @@ def create_sidebar_options():
131
  help="Rotate image if needed")
132
 
133
  # Create preprocessing options dictionary
 
 
 
 
 
 
 
 
 
134
  preprocessing_options = {
135
- "document_type": "standard", # Use standard as default, removed duplicate option
136
  "grayscale": grayscale,
137
  "denoise": denoise,
138
  "contrast": contrast,
@@ -141,23 +151,15 @@ def create_sidebar_options():
141
 
142
  # PDF-specific options in an expandable section
143
  with st.expander("PDF Options"):
144
- pdf_dpi = st.slider("PDF Resolution (DPI)",
145
- min_value=MIN_PDF_DPI,
146
- max_value=MAX_PDF_DPI,
147
- value=DEFAULT_PDF_DPI,
148
- step=25,
149
- help="Higher DPI gives better quality but slower processing")
150
-
151
  max_pages = st.number_input("Maximum Pages to Process",
152
  min_value=1,
153
  max_value=20,
154
  value=DEFAULT_MAX_PAGES,
155
  help="Limit the number of pages to process (for multi-page PDFs)")
156
 
157
- pdf_rotation = st.radio("PDF Rotation", ROTATION_OPTIONS,
158
- horizontal=True,
159
- format_func=lambda x: f"{x}°",
160
- help="Rotate PDF pages if needed")
161
 
162
  # Create options dictionary
163
  options = {
@@ -175,28 +177,23 @@ def create_sidebar_options():
175
  def create_file_uploader():
176
  """Create and return a file uploader"""
177
  # Add app description
178
- favicon_path = os.path.join(os.path.dirname(__file__), "static/favicon.png")
179
- favicon_base64 = get_base64_from_image(favicon_path)
180
- st.markdown(f'<div style="display: flex; align-items: center; gap: 10px;"><img src="data:image/png;base64,{favicon_base64}" width="36" height="36" alt="Scroll Icon"/> <div><h2 style="margin: 0; padding: 10px 0 0 0;">Historical Document OCR</h2></div></div>', unsafe_allow_html=True)
181
- st.markdown("<p style='font-size: 0.8em; color: #666; text-align: right;'>Made possible by Mistral AI</p>", unsafe_allow_html=True)
182
 
183
  # Add project framing
184
  st.markdown("""
185
- This tool is designed to assist scholars in historical research by extracting text from challenging documents.
186
- While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
187
- historical documents, particularly:
 
188
 
189
- - **Historical newspapers** with complex layouts and aged text
190
- - **Handwritten documents** from various time periods
191
- - **Photos of archival materials** that may be difficult to read
192
-
193
- Upload a document to get started, or explore the example documents.
194
  """)
195
 
196
- # Create file uploader
197
  uploaded_file = st.file_uploader(
198
- "Upload a document",
199
- type=["pdf", "png", "jpg", "jpeg"],
200
  help="Upload a PDF or image file for OCR processing"
201
  )
202
  return uploaded_file
@@ -204,136 +201,407 @@ def create_file_uploader():
204
  def display_results(result, container, custom_prompt=""):
205
  """Display OCR results in the provided container"""
206
  with container:
207
- # Display document metadata
208
- st.subheader("Document Metadata")
 
 
209
 
210
- # Create columns for metadata
211
- meta_col1, meta_col2 = st.columns(2)
 
212
 
213
- with meta_col1:
214
- # Display document type and languages
215
- if 'detected_document_type' in result:
216
- st.write(f"**Document Type:** {result['detected_document_type']}")
 
 
 
217
 
218
- if 'languages' in result:
219
- languages = [lang for lang in result['languages'] if lang is not None]
220
- if languages:
221
- st.write(f"**Languages:** {', '.join(languages)}")
222
 
223
- with meta_col2:
224
- # Display processing time
225
- if 'processing_time' in result:
226
- st.write(f"**Processing Time:** {result['processing_time']:.1f}s")
227
-
228
- # Display page information for PDFs
229
- if 'limited_pages' in result:
230
- st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
 
 
 
 
 
 
231
 
232
- # Display subject tags if available
233
  if 'topics' in result and result['topics']:
234
- st.write("**Subject Tags:**")
235
- # Create a container with flex display for the tags
236
- st.markdown('<div style="display: flex; flex-wrap: wrap; gap: 5px; margin-top: 5px;">', unsafe_allow_html=True)
237
-
238
- # Generate a badge for each tag
239
- for topic in result['topics']:
240
- # Create colored badge based on tag category
241
- badge_color = "#546e7a" # Default color
 
242
 
243
- # Assign colors by category
244
- if any(term in topic.lower() for term in ["century", "pre-", "era", "historical"]):
245
- badge_color = "#1565c0" # Blue for time periods
246
- elif any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
247
- badge_color = "#00695c" # Teal for languages
248
- elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
249
- badge_color = "#6a1b9a" # Purple for document types
250
- elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
251
- badge_color = "#2e7d32" # Green for subject domains
252
- elif any(term in topic.lower() for term in ["preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated"]):
253
- badge_color = "#e65100" # Orange for preprocessing-related tags
254
 
255
- st.markdown(
256
- f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
257
- f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">{topic}</span>',
258
- unsafe_allow_html=True
259
- )
260
-
261
- # Close the container
262
- st.markdown('</div>', unsafe_allow_html=True)
 
 
 
 
 
 
 
 
263
 
264
- # Display OCR content
265
- st.subheader("OCR Content")
266
 
267
- # Check if we have OCR content
268
- if 'ocr_contents' in result:
269
- # Create tabs for different views
270
- has_images = result.get('has_images', False)
271
- if has_images:
272
- content_tab1, content_tab2, content_tab3 = st.tabs(["Structured View", "Raw Text", "With Images"])
273
- else:
274
- content_tab1, content_tab2 = st.tabs(["Structured View", "Raw Text"])
 
 
 
275
 
 
276
  with content_tab1:
277
- # Display structured content
278
  if isinstance(result['ocr_contents'], dict):
279
- for section, content in result['ocr_contents'].items():
280
- if content and section not in ['error', 'raw_text', 'partial_text']: # Skip error and raw text sections
281
- st.markdown(f"#### {section.replace('_', ' ').title()}")
 
 
 
 
282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  if isinstance(content, str):
284
- st.write(content)
 
 
 
 
 
 
 
 
285
  elif isinstance(content, list):
 
 
286
  for item in content:
287
  if isinstance(item, str):
288
- st.write(f"- {item}")
 
 
 
289
  else:
290
- st.write(f"- {str(item)}")
 
 
 
 
 
 
 
 
291
  elif isinstance(content, dict):
 
 
292
  for k, v in content.items():
293
- st.write(f"**{k}:** {v}")
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
- with content_tab2:
296
- # Display raw text with editing capability
297
- raw_text = ""
298
- if 'raw_text' in result['ocr_contents']:
299
- raw_text = result['ocr_contents']['raw_text']
300
- elif 'content' in result['ocr_contents']:
301
- raw_text = result['ocr_contents']['content']
302
-
303
- # Allow editing of the raw text
304
- edited_text = st.text_area("Edit Raw Text", raw_text, height=400)
305
-
306
- # Add a button to copy the edited text to clipboard
307
- if st.button("Copy to Clipboard"):
308
- st.success("Text copied to clipboard! (You can paste it elsewhere)")
309
- # Note: The actual clipboard functionality is handled by the browser
310
-
311
- # Add a download button for the edited text
312
- st.download_button(
313
- label="Download Edited Text",
314
- data=edited_text,
315
- file_name=f"{result.get('file_name', 'document').split('.')[0]}_edited.txt",
316
- mime="text/plain"
317
- )
318
 
319
- if has_images and 'pages_data' in result:
320
- with content_tab3:
321
- # Use the display_document_with_images function
322
- display_document_with_images(result)
323
-
324
- # Display custom prompt if provided
325
- if custom_prompt:
326
- with st.expander("Custom Processing Instructions"):
327
- st.write(custom_prompt)
328
-
329
- # Add download buttons
330
- st.subheader("Download Results")
331
-
332
- # Create columns for download buttons
333
- download_col1, download_col2 = st.columns(2)
334
-
335
- with download_col1:
336
- # JSON download
337
  try:
338
  json_str = json.dumps(result, indent=2)
339
  st.download_button(
@@ -344,8 +612,7 @@ def display_results(result, container, custom_prompt=""):
344
  )
345
  except Exception as e:
346
  st.error(f"Error creating JSON download: {str(e)}")
347
-
348
- with download_col2:
349
  # Text download
350
  try:
351
  if 'ocr_contents' in result:
@@ -369,314 +636,319 @@ def display_results(result, container, custom_prompt=""):
369
 
370
  def display_document_with_images(result):
371
  """Display document with images"""
372
- if 'pages_data' not in result:
 
 
 
 
 
 
373
  st.info("No image data available.")
374
  return
375
 
376
  # Display each page
377
- for i, page_data in enumerate(result['pages_data']):
378
  st.markdown(f"### Page {i+1}")
379
 
380
  # Create columns for image and text
381
  img_col, text_col = st.columns([1, 1])
382
 
383
  with img_col:
384
- # Display the image
 
 
 
385
  if 'image_data' in page_data:
386
  try:
387
  # Convert base64 to image
388
  image_data = base64.b64decode(page_data['image_data'])
389
  st.image(io.BytesIO(image_data), use_container_width=True)
 
390
  except Exception as e:
391
- st.error(f"Error displaying image: {str(e)}")
392
- else:
 
 
 
 
 
 
 
 
 
 
 
 
393
  st.info("No image available for this page.")
394
 
395
  with text_col:
396
- # Display the text with editing capability
 
397
  if 'text' in page_data:
398
- edited_text = st.text_area(f"Page {i+1} Text", page_data['text'], height=300, key=f"page_text_{i}")
 
 
 
 
 
 
 
399
 
400
- # Add a button to copy the edited text to clipboard
401
- if st.button(f"Copy Page {i+1} Text", key=f"copy_btn_{i}"):
402
- st.success(f"Page {i+1} text copied to clipboard!")
403
  else:
404
  st.info("No text available for this page.")
405
 
406
  def display_previous_results():
407
- """Display previous results tab content"""
408
- st.markdown('<h2>Previous Results</h2>', unsafe_allow_html=True)
409
 
410
- # Load custom CSS for Previous Results tab
411
- try:
412
- from ui.layout import load_css
413
- load_css()
414
- except ImportError:
415
- # If ui.layout module is not available, use a simplified version
416
- st.markdown("""
417
- <style>
418
- .previous-results-container {
419
- margin-top: 20px;
420
- }
421
- .result-card {
422
- background-color: #f8f9fa;
423
- border-radius: 8px;
424
- padding: 15px;
425
- margin-bottom: 15px;
426
- border: 1px solid #e0e0e0;
427
- }
428
- .result-header {
429
- display: flex;
430
- justify-content: space-between;
431
- margin-bottom: 10px;
432
- }
433
- .result-filename {
434
- font-weight: bold;
435
- font-size: 16px;
436
- }
437
- .result-date {
438
- color: #666;
439
- font-size: 14px;
440
- }
441
- .result-metadata {
442
- margin-top: 10px;
443
- font-size: 14px;
444
- }
445
- .result-tag {
446
- margin-bottom: 5px;
447
- color: #555;
448
- }
449
- .result-action-button {
450
- margin-top: 10px;
451
- text-align: right;
452
- }
453
- .selected-result-container {
454
- margin-top: 30px;
455
- padding: 20px;
456
- background-color: #f0f2f6;
457
- border-radius: 8px;
458
- }
459
- .selected-result-title {
460
- font-size: 18px;
461
- font-weight: bold;
462
- }
463
- </style>
464
- """, unsafe_allow_html=True)
465
 
466
  # Display previous results if available
467
  if not st.session_state.previous_results:
468
  st.markdown("""
469
- <div class="previous-results-container" style="text-align: center; padding: 40px 20px; background-color: #f0f2f6; border-radius: 8px;">
470
- <div style="font-size: 48px; margin-bottom: 20px;">📄</div>
471
- <h3 style="margin-bottom: 10px; font-weight: 600;">No Previous Results</h3>
472
- <p style="font-size: 16px;">Process a document to see your results history saved here.</p>
473
  </div>
474
  """, unsafe_allow_html=True)
475
  else:
476
- # Create a container for the results list
477
- st.markdown('<div class="previous-results-container">', unsafe_allow_html=True)
478
- st.markdown(f'<h3>{len(st.session_state.previous_results)} Previous Results</h3>', unsafe_allow_html=True)
479
-
480
- # Create two columns for filters and download buttons
481
- filter_col, download_col = st.columns([2, 1])
482
-
483
- with filter_col:
484
- # Add filter options
485
- filter_options = ["All Types"]
486
- if any(result.get("file_name", "").lower().endswith(".pdf") for result in st.session_state.previous_results):
487
- filter_options.append("PDF Documents")
488
- if any(result.get("file_name", "").lower().endswith((".jpg", ".jpeg", ".png")) for result in st.session_state.previous_results):
489
- filter_options.append("Images")
490
 
491
- selected_filter = st.selectbox("Filter by Type:", filter_options)
 
 
 
 
 
 
 
 
 
 
 
 
492
 
493
- with download_col:
494
- # Add download all button for results
495
- if len(st.session_state.previous_results) > 0:
496
- try:
497
- # Create buffer in memory instead of file on disk
498
- import io
499
- from ocr_utils import create_results_zip_in_memory
500
-
501
- # Get zip data directly in memory
502
- zip_data = create_results_zip_in_memory(st.session_state.previous_results)
503
-
504
- # Create more informative ZIP filename with timestamp
505
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 
506
 
507
- # Count document types for a more descriptive filename
508
- pdf_count = sum(1 for r in st.session_state.previous_results if r.get('file_name', '').lower().endswith('.pdf'))
509
- img_count = sum(1 for r in st.session_state.previous_results if r.get('file_name', '').lower().endswith(('.jpg', '.jpeg', '.png')))
510
 
511
- # Create more descriptive filename
512
- if pdf_count > 0 and img_count > 0:
513
- zip_filename = f"historical_ocr_mixed_{pdf_count}pdf_{img_count}img_{timestamp}.zip"
514
- elif pdf_count > 0:
515
- zip_filename = f"historical_ocr_pdf_documents_{pdf_count}_{timestamp}.zip"
516
- elif img_count > 0:
517
- zip_filename = f"historical_ocr_images_{img_count}_{timestamp}.zip"
518
  else:
519
- zip_filename = f"historical_ocr_results_{timestamp}.zip"
520
 
521
- st.download_button(
522
- label="Download All Results",
523
- data=zip_data,
524
- file_name=zip_filename,
525
- mime="application/zip",
526
- help="Download all previous results as a ZIP file containing HTML and JSON files"
527
- )
528
- except Exception as e:
529
- st.error(f"Error creating download: {str(e)}")
530
- st.info("Try with fewer results or individual downloads")
531
-
532
- # Filter results based on selection
533
- filtered_results = st.session_state.previous_results
534
- if selected_filter == "PDF Documents":
535
- filtered_results = [r for r in st.session_state.previous_results if r.get("file_name", "").lower().endswith(".pdf")]
536
- elif selected_filter == "Images":
537
- filtered_results = [r for r in st.session_state.previous_results if r.get("file_name", "").lower().endswith((".jpg", ".jpeg", ".png"))]
538
-
539
- # Show a message if no results match the filter
540
- if not filtered_results:
541
- st.markdown("""
542
- <div style="text-align: center; padding: 20px; background-color: #f9f9f9; border-radius: 5px; margin: 20px 0;">
543
- <p>No results match the selected filter.</p>
544
- </div>
545
- """, unsafe_allow_html=True)
546
-
547
- # Display each result as a card
548
- for i, result in enumerate(filtered_results):
549
- # Determine file type icon
550
- file_name = result.get("file_name", f"Document {i+1}")
551
- file_type_lower = file_name.lower()
552
-
553
- if file_type_lower.endswith(".pdf"):
554
- icon = "📄"
555
- elif file_type_lower.endswith((".jpg", ".jpeg", ".png", ".gif")):
556
- icon = "🖼️"
557
- else:
558
- icon = "📝"
559
-
560
- # Create a card for each result
561
- st.markdown(f"""
562
- <div class="result-card">
563
- <div class="result-header">
564
- <div class="result-filename">{icon} {result.get('descriptive_file_name', file_name)}</div>
565
- <div class="result-date">{result.get('timestamp', 'Unknown')}</div>
566
- </div>
567
- <div class="result-metadata">
568
- <div class="result-tag">Languages: {', '.join(result.get('languages', ['Unknown']))}</div>
569
- <div class="result-tag">Topics: {', '.join(result.get('topics', ['Unknown'])[:5])} {' + ' + str(len(result.get('topics', [])) - 5) + ' more' if len(result.get('topics', [])) > 5 else ''}</div>
570
- </div>
571
- """, unsafe_allow_html=True)
572
-
573
- # Add view button inside the card with proper styling
574
- st.markdown('<div class="result-action-button">', unsafe_allow_html=True)
575
- if st.button(f"View Document", key=f"view_{i}"):
576
- # Set the selected result in the session state
577
- st.session_state.selected_previous_result = st.session_state.previous_results[i]
578
- # Force a rerun to show the selected result
579
- st.rerun()
580
- st.markdown('</div>', unsafe_allow_html=True)
581
-
582
- # Close the result card
583
- st.markdown('</div>', unsafe_allow_html=True)
584
-
585
- # Close the container
586
- st.markdown('</div>', unsafe_allow_html=True)
587
 
588
  # Display the selected result if available
589
  if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
590
  selected_result = st.session_state.selected_previous_result
591
 
592
- # Create a styled container for the selected result
593
- st.markdown(f"""
594
- <div class="selected-result-container">
595
- <div class="result-header" style="margin-bottom: 20px;">
596
- <div class="selected-result-title">Selected Document: {selected_result.get('file_name', 'Unknown')}</div>
597
- <div class="result-date">{selected_result.get('timestamp', '')}</div>
598
- </div>
599
- """, unsafe_allow_html=True)
600
-
601
- # Display metadata in a styled way
602
- meta_col1, meta_col2 = st.columns(2)
603
-
604
- with meta_col1:
605
- # Display document metadata
606
- if 'languages' in selected_result:
607
- languages = [lang for lang in selected_result['languages'] if lang is not None]
608
- if languages:
609
- st.write(f"**Languages:** {', '.join(languages)}")
 
 
610
 
611
- if 'topics' in selected_result and selected_result['topics']:
612
- # Show topics in a more organized way with badges
613
- st.markdown("**Subject Tags:**")
614
- # Create a container with flex display for the tags
615
- st.markdown('<div style="display: flex; flex-wrap: wrap; gap: 5px; margin-top: 5px;">', unsafe_allow_html=True)
616
-
617
- # Generate a badge for each tag
618
- for topic in selected_result['topics']:
619
- # Create colored badge based on tag category
620
- badge_color = "#546e7a" # Default color
621
-
622
- # Assign colors by category
623
- if any(term in topic.lower() for term in ["century", "pre-", "era", "historical"]):
624
- badge_color = "#1565c0" # Blue for time periods
625
- elif any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
626
- badge_color = "#00695c" # Teal for languages
627
- elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
628
- badge_color = "#6a1b9a" # Purple for document types
629
- elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
630
- badge_color = "#2e7d32" # Green for subject domains
631
- elif any(term in topic.lower() for term in ["preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated"]):
632
- badge_color = "#e65100" # Orange for preprocessing-related tags
633
-
634
- st.markdown(
635
- f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
636
- f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">{topic}</span>',
637
- unsafe_allow_html=True
638
- )
639
-
640
- # Close the container
641
- st.markdown('</div>', unsafe_allow_html=True)
642
 
643
- with meta_col2:
644
- # Display processing metadata
645
- if 'limited_pages' in selected_result:
646
- st.info(f"Processed {selected_result['limited_pages']['processed']} of {selected_result['limited_pages']['total']} pages")
647
 
648
- if 'processing_time' in selected_result:
649
- proc_time = selected_result['processing_time']
650
- st.write(f"**Processing Time:** {proc_time:.1f}s")
651
 
652
- # Create tabs for content display
653
  has_images = selected_result.get('has_images', False)
654
  if has_images:
655
- view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw Text", "With Images"])
 
656
  else:
657
- view_tab1, view_tab2 = st.tabs(["Structured View", "Raw Text"])
 
658
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
659
  with view_tab1:
660
- # Display structured content
661
  if 'ocr_contents' in selected_result and isinstance(selected_result['ocr_contents'], dict):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
662
  for section, content in selected_result['ocr_contents'].items():
663
- if content and section not in ['error', 'raw_text', 'partial_text']: # Skip error and raw text sections
664
- st.markdown(f"#### {section.replace('_', ' ').title()}")
 
 
665
 
666
  if isinstance(content, str):
667
- st.write(content)
668
  elif isinstance(content, list):
669
  for item in content:
670
- if isinstance(item, str):
671
- st.write(f"- {item}")
672
- else:
673
- st.write(f"- {str(item)}")
674
  elif isinstance(content, dict):
675
  for k, v in content.items():
676
- st.write(f"**{k}:** {v}")
677
 
 
678
  with view_tab2:
679
- # Display raw text with editing capability
680
  raw_text = ""
681
  if 'ocr_contents' in selected_result:
682
  if 'raw_text' in selected_result['ocr_contents']:
@@ -684,74 +956,91 @@ def display_previous_results():
684
  elif 'content' in selected_result['ocr_contents']:
685
  raw_text = selected_result['ocr_contents']['content']
686
 
687
- # Allow editing of the raw text
688
- edited_text = st.text_area("Edit Raw Text", raw_text, height=400, key="selected_raw_text")
689
-
690
- # Add a button to copy the edited text to clipboard
691
- if st.button("Copy to Clipboard", key="selected_copy_btn"):
692
- st.success("Text copied to clipboard! (You can paste it elsewhere)")
693
 
694
- # Add a download button for the edited text
695
- st.download_button(
696
- label="Download Edited Text",
697
- data=edited_text,
698
- file_name=f"{selected_result.get('file_name', 'document').split('.')[0]}_edited.txt",
699
- mime="text/plain",
700
- key="selected_download_btn"
701
- )
 
 
 
 
702
 
 
703
  if has_images and 'pages_data' in selected_result:
704
  with view_tab3:
705
- # Use the display_document_with_images function
706
- display_document_with_images(selected_result)
707
-
708
- # Close the container
709
- st.markdown('</div>', unsafe_allow_html=True)
710
-
711
- # Add a button to close the selected result
712
- if st.button("Close Selected Document", key="close_selected"):
713
- # Clear the selected result from session state
714
- del st.session_state.selected_previous_result
715
- # Force a rerun to update the view
716
- st.rerun()
 
 
 
 
 
 
717
 
718
  def display_about_tab():
719
  """Display about tab content"""
720
- st.markdown('<h2>About Historical OCR</h2>', unsafe_allow_html=True)
721
 
722
  # Add app description
723
  st.markdown("""
724
  **Historical OCR** is a specialized tool for extracting text from historical documents, manuscripts, and printed materials.
 
725
 
726
- ### Purpose
727
-
 
728
  This tool is designed to assist scholars in historical research by extracting text from challenging documents.
729
  While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
730
  historical documents, particularly:
 
731
 
 
732
  - **Historical newspapers** with complex layouts and aged text
733
  - **Handwritten documents** from various time periods
734
  - **Photos of archival materials** that may be difficult to read
 
735
 
736
- ### Features
737
-
 
738
  - **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
739
  - **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
740
  - **Editable Results**: Review and edit extracted text directly in the interface
741
  - **Structured Content Analysis**: Automatic organization of document content
742
  - **Multi-language Support**: Process documents in various languages
743
  - **PDF Processing**: Handle multi-page historical documents
 
744
 
745
- ### How to Use
746
-
 
747
  1. Upload a document (PDF or image)
748
  2. Select the document type and adjust preprocessing options if needed
749
  3. Add custom processing instructions for specialized documents
750
  4. Process the document
751
  5. Review, edit, and download the results
 
752
 
753
- ### Technologies
754
-
 
755
  - OCR processing using Mistral AI's advanced document understanding capabilities
756
  - Image preprocessing with OpenCV
757
  - PDF handling with pdf2image
 
2
  import os
3
  import io
4
  import base64
5
+ import logging
6
  from datetime import datetime
7
  from pathlib import Path
8
  import json
 
65
  def create_sidebar_options():
66
  """Create and return sidebar options"""
67
  with st.sidebar:
68
+ st.markdown("## OCR Settings")
69
 
70
  # Create a container for the sidebar options
71
  with st.container():
72
  # Model selection
73
+ st.markdown("### Model Selection")
74
  use_vision = st.toggle("Use Vision Model", value=True, help="Use vision model for better understanding of document structure")
75
 
76
  # Document type selection
77
+ st.markdown("### Document Type")
78
  doc_type = st.selectbox("Document Type", DOCUMENT_TYPES,
79
  help="Select the type of document you're processing for better results")
80
 
 
101
  st.markdown("**Custom Processing Instructions**")
102
  custom_prompt = st.text_area("", value=custom_prompt,
103
  help="Customize the instructions for processing this document",
104
+ height=80)
105
 
106
  # Image preprocessing options in an expandable section
107
  with st.expander("Image Preprocessing"):
 
132
  help="Rotate image if needed")
133
 
134
  # Create preprocessing options dictionary
135
+ # Set document_type based on selection in UI
136
+ doc_type_for_preprocessing = "standard"
137
+ if "Handwritten" in doc_type:
138
+ doc_type_for_preprocessing = "handwritten"
139
+ elif "Newspaper" in doc_type or "Magazine" in doc_type:
140
+ doc_type_for_preprocessing = "newspaper"
141
+ elif "Book" in doc_type or "Publication" in doc_type:
142
+ doc_type_for_preprocessing = "printed"
143
+
144
  preprocessing_options = {
145
+ "document_type": doc_type_for_preprocessing,
146
  "grayscale": grayscale,
147
  "denoise": denoise,
148
  "contrast": contrast,
 
151
 
152
  # PDF-specific options in an expandable section
153
  with st.expander("PDF Options"):
 
 
 
 
 
 
 
154
  max_pages = st.number_input("Maximum Pages to Process",
155
  min_value=1,
156
  max_value=20,
157
  value=DEFAULT_MAX_PAGES,
158
  help="Limit the number of pages to process (for multi-page PDFs)")
159
 
160
+ # Set default values for removed options
161
+ pdf_dpi = DEFAULT_PDF_DPI
162
+ pdf_rotation = 0
 
163
 
164
  # Create options dictionary
165
  options = {
 
177
  def create_file_uploader():
178
  """Create and return a file uploader"""
179
  # Add app description
180
+ st.markdown(f'<div style="display: flex; align-items: center; gap: 10px;"><div style="font-size: 32px;">📜</div><div><h2 style="margin: 0; padding: 10px 0 0 0;">Historical OCR</h2></div></div>', unsafe_allow_html=True)
181
+ st.markdown("<p style='font-size: 0.8em; color: #666; text-align: left;'>Made possible by Mistral AI</p>", unsafe_allow_html=True)
 
 
182
 
183
  # Add project framing
184
  st.markdown("""
185
+ This tool assists scholars in historical research by extracting text from challenging documents. While it may not achieve 100% accuracy, it helps navigate:
186
+ - **Historical newspapers** with complex layouts
187
+ - **Handwritten documents** from various periods
188
+ - **Photos of archival materials**
189
 
190
+ Upload a document to begin, or explore the examples.
 
 
 
 
191
  """)
192
 
193
+ # Create file uploader with a more concise label
194
  uploaded_file = st.file_uploader(
195
+ "Select file",
196
+ type=["pdf", "png", "jpg"],
197
  help="Upload a PDF or image file for OCR processing"
198
  )
199
  return uploaded_file
 
201
  def display_results(result, container, custom_prompt=""):
202
  """Display OCR results in the provided container"""
203
  with container:
204
+ # No heading for document metadata - start directly with content
205
+
206
+ # Create a compact metadata section
207
+ meta_html = '<div style="display: flex; flex-wrap: wrap; gap: 0.3rem; margin-bottom: 0.3rem;">'
208
 
209
+ # Document type
210
+ if 'detected_document_type' in result:
211
+ meta_html += f'<div><strong>Type:</strong> {result["detected_document_type"]}</div>'
212
 
213
+ # Processing time
214
+ if 'processing_time' in result:
215
+ meta_html += f'<div><strong>Time:</strong> {result["processing_time"]:.1f}s</div>'
216
+
217
+ # Page information
218
+ if 'limited_pages' in result:
219
+ meta_html += f'<div><strong>Pages:</strong> {result["limited_pages"]["processed"]}/{result["limited_pages"]["total"]}</div>'
220
 
221
+ meta_html += '</div>'
222
+ st.markdown(meta_html, unsafe_allow_html=True)
223
+
224
+ # Language metadata on a separate line, Subject Tags below
225
 
226
+ # First show languages if available
227
+ if 'languages' in result and result['languages']:
228
+ languages = [lang for lang in result['languages'] if lang is not None]
229
+ if languages:
230
+ # Create a dedicated line for Languages
231
+ lang_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
232
+ lang_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Language:</div>'
233
+
234
+ # Add language tags
235
+ for lang in languages:
236
+ # Clean language name if needed
237
+ clean_lang = str(lang).strip()
238
+ if clean_lang: # Only add if not empty
239
+ lang_html += f'<span class="subject-tag tag-language">{clean_lang}</span>'
240
+
241
+ lang_html += '</div>'
242
+ st.markdown(lang_html, unsafe_allow_html=True)
243
+
244
+ # Create a separate line for Time if we have time-related tags
245
+ if 'topics' in result and result['topics']:
246
+ time_tags = [topic for topic in result['topics']
247
+ if any(term in topic.lower() for term in ["century", "pre-", "era", "historical"])]
248
+ if time_tags:
249
+ time_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
250
+ time_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Time:</div>'
251
+ for tag in time_tags:
252
+ time_html += f'<span class="subject-tag tag-time-period">{tag}</span>'
253
+ time_html += '</div>'
254
+ st.markdown(time_html, unsafe_allow_html=True)
255
 
256
+ # Then display remaining subject tags if available
257
  if 'topics' in result and result['topics']:
258
+ # Filter out time-related tags which are already displayed
259
+ subject_tags = [topic for topic in result['topics']
260
+ if not any(term in topic.lower() for term in ["century", "pre-", "era", "historical"])]
261
+
262
+ if subject_tags:
263
+ # Create a separate line for Subject Tags
264
+ tags_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
265
+ tags_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Subject Tags:</div>'
266
+ tags_html += '<div style="display: flex; flex-wrap: wrap; gap: 2px; align-items: center;">'
267
 
268
+ # Generate a badge for each remaining tag
269
+ for topic in subject_tags:
270
+ # Determine tag category class
271
+ tag_class = "subject-tag" # Default class
 
 
 
 
 
 
 
272
 
273
+ # Add specialized class based on category
274
+ if any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
275
+ tag_class += " tag-language" # Languages
276
+ elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
277
+ tag_class += " tag-document-type" # Document types
278
+ elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
279
+ tag_class += " tag-subject" # Subject domains
280
+
281
+ # Add each tag as an inline span
282
+ tags_html += f'<span class="{tag_class}">{topic}</span>'
283
+
284
+ # Close the containers
285
+ tags_html += '</div></div>'
286
+
287
+ # Render the subject tags section
288
+ st.markdown(tags_html, unsafe_allow_html=True)
289
 
290
+ # No OCR content heading - start directly with tabs
 
291
 
292
+ # Check if we have OCR content
293
+ if 'ocr_contents' in result:
294
+ # Create a single view instead of tabs
295
+ content_tab1 = st.container()
296
+
297
+ # Check for images in the result to use later
298
+ has_images = result.get('has_images', False)
299
+ has_image_data = ('pages_data' in result and any(page.get('images', []) for page in result.get('pages_data', [])))
300
+ has_raw_images = ('raw_response_data' in result and 'pages' in result['raw_response_data'] and
301
+ any('images' in page for page in result['raw_response_data']['pages']
302
+ if isinstance(page, dict)))
303
 
304
+ # Display structured content
305
  with content_tab1:
306
+ # Display structured content with markdown formatting
307
  if isinstance(result['ocr_contents'], dict):
308
+ # CSS is now handled in the main layout.py file
309
+
310
+ # Function to process text with markdown support
311
+ def format_markdown_text(text):
312
+ """Format text with markdown and handle special patterns"""
313
+ if not text:
314
+ return ""
315
 
316
+ import re
317
+
318
+ # First, ensure we're working with a string
319
+ if not isinstance(text, str):
320
+ text = str(text)
321
+
322
+ # Ensure newlines are preserved for proper spacing
323
+ # Convert any Windows line endings to Unix
324
+ text = text.replace('\r\n', '\n')
325
+
326
+ # Format dates (MM/DD/YYYY or similar patterns)
327
+ date_pattern = r'\b(0?[1-9]|1[0-2])[\/\-\.](0?[1-9]|[12][0-9]|3[01])[\/\-\.](\d{4}|\d{2})\b'
328
+ text = re.sub(date_pattern, r'**\g<0>**', text)
329
+
330
+ # Detect markdown tables and preserve them
331
+ table_sections = []
332
+ non_table_lines = []
333
+ in_table = False
334
+ table_buffer = []
335
+
336
+ # Process text line by line, preserving tables
337
+ lines = text.split('\n')
338
+ for i, line in enumerate(lines):
339
+ line_stripped = line.strip()
340
+
341
+ # Detect table rows by pipe character
342
+ if '|' in line_stripped and (line_stripped.startswith('|') or line_stripped.endswith('|')):
343
+ if not in_table:
344
+ in_table = True
345
+ if table_buffer:
346
+ table_buffer = []
347
+ table_buffer.append(line)
348
+
349
+ # Check if the next line is a table separator
350
+ if i < len(lines) - 1 and '---' in lines[i+1] and '|' in lines[i+1]:
351
+ table_buffer.append(lines[i+1])
352
+
353
+ # Detect table separators (---|---|---)
354
+ elif in_table and '---' in line_stripped and '|' in line_stripped:
355
+ table_buffer.append(line)
356
+
357
+ # End of table detection
358
+ elif in_table:
359
+ # Check if this is still part of the table
360
+ next_line_is_table = False
361
+ if i < len(lines) - 1:
362
+ next_line = lines[i+1].strip()
363
+ if '|' in next_line and (next_line.startswith('|') or next_line.endswith('|')):
364
+ next_line_is_table = True
365
+
366
+ if not next_line_is_table:
367
+ in_table = False
368
+ # Save the complete table
369
+ if table_buffer:
370
+ table_sections.append('\n'.join(table_buffer))
371
+ table_buffer = []
372
+ # Add current line to non-table lines
373
+ non_table_lines.append(line)
374
+ else:
375
+ # Still part of the table
376
+ table_buffer.append(line)
377
+ else:
378
+ # Not in a table
379
+ non_table_lines.append(line)
380
+
381
+ # Handle any remaining table buffer
382
+ if in_table and table_buffer:
383
+ table_sections.append('\n'.join(table_buffer))
384
+
385
+ # Process non-table lines
386
+ processed_lines = []
387
+ for line in non_table_lines:
388
+ line_stripped = line.strip()
389
+
390
+ # Check if line is in ALL CAPS (and not just a short acronym)
391
+ if line_stripped and line_stripped.isupper() and len(line_stripped) > 3:
392
+ # ALL CAPS line - make bold instead of heading to prevent large display
393
+ processed_lines.append(f"**{line_stripped}**")
394
+ # Process potential headers (lines ending with colon)
395
+ elif line_stripped and line_stripped.endswith(':') and len(line_stripped) < 40:
396
+ # Likely a header - make it bold
397
+ processed_lines.append(f"**{line_stripped}**")
398
+ else:
399
+ # Keep original line with its spacing
400
+ processed_lines.append(line)
401
+
402
+ # Join non-table lines
403
+ processed_text = '\n'.join(processed_lines)
404
+
405
+ # Reinsert tables in the right positions
406
+ for table in table_sections:
407
+ # Generate a unique marker for this table
408
+ marker = f"__TABLE_MARKER_{hash(table) % 10000}__"
409
+ # Find a good position to insert this table
410
+ # For now, just append all tables at the end
411
+ processed_text += f"\n\n{table}\n\n"
412
+
413
+ # Make sure paragraphs have proper spacing but not excessive
414
+ processed_text = re.sub(r'\n{3,}', '\n\n', processed_text)
415
+
416
+ # Ensure two newlines between paragraphs for proper markdown rendering
417
+ processed_text = re.sub(r'([^\n])\n([^\n])', r'\1\n\n\2', processed_text)
418
+
419
+ return processed_text
420
+
421
+ # Collect all available images from the result
422
+ available_images = []
423
+ if has_images and 'pages_data' in result:
424
+ for page_idx, page in enumerate(result['pages_data']):
425
+ if 'images' in page and len(page['images']) > 0:
426
+ for img_idx, img in enumerate(page['images']):
427
+ if 'image_base64' in img:
428
+ available_images.append({
429
+ 'source': 'pages_data',
430
+ 'page': page_idx,
431
+ 'index': img_idx,
432
+ 'data': img['image_base64']
433
+ })
434
+
435
+ # Get images from raw response as well
436
+ if 'raw_response_data' in result:
437
+ raw_data = result['raw_response_data']
438
+ if isinstance(raw_data, dict) and 'pages' in raw_data:
439
+ for page_idx, page in enumerate(raw_data['pages']):
440
+ if isinstance(page, dict) and 'images' in page:
441
+ for img_idx, img in enumerate(page['images']):
442
+ if isinstance(img, dict) and 'base64' in img:
443
+ available_images.append({
444
+ 'source': 'raw_response',
445
+ 'page': page_idx,
446
+ 'index': img_idx,
447
+ 'data': img['base64']
448
+ })
449
+
450
+ # Extract images for display at the top
451
+ images_to_display = []
452
+
453
+ # First, collect all available images
454
+ for img_idx, img in enumerate(available_images):
455
+ if 'data' in img:
456
+ images_to_display.append({
457
+ 'data': img['data'],
458
+ 'id': img.get('id', f"img_{img_idx}"),
459
+ 'index': img_idx
460
+ })
461
+
462
+ # Display images at the top if available
463
+ if images_to_display:
464
+ st.markdown("### Document Images")
465
+ # Create columns for a grid layout (up to 2 columns to make images larger)
466
+ cols_count = min(2, len(images_to_display))
467
+ image_cols = st.columns(cols_count)
468
+
469
+ # Display each image in a column with minimal spacing
470
+ for i, img in enumerate(images_to_display):
471
+ with image_cols[i % cols_count]:
472
+ # Compact image display
473
+ st.image(img['data'], use_container_width=True)
474
+ st.markdown(f"<p style='margin-top:-5px; font-size:0.8rem; color:#666; text-align:center;'>Document Image {i+1}</p>", unsafe_allow_html=True)
475
+
476
+ # Organize sections in a logical order
477
+ section_order = ["title", "author", "date", "summary", "content", "transcript", "metadata"]
478
+ ordered_sections = []
479
+
480
+ # Add known sections first in preferred order
481
+ for section_name in section_order:
482
+ if section_name in result['ocr_contents'] and result['ocr_contents'][section_name]:
483
+ ordered_sections.append(section_name)
484
+
485
+ # Add any remaining sections
486
+ for section in result['ocr_contents'].keys():
487
+ if (section not in ordered_sections and
488
+ section not in ['error', 'partial_text'] and
489
+ result['ocr_contents'][section]):
490
+ ordered_sections.append(section)
491
+
492
+ # If only raw_text is available and no other content, add it last
493
+ if ('raw_text' in result['ocr_contents'] and
494
+ result['ocr_contents']['raw_text'] and
495
+ len(ordered_sections) == 0):
496
+ ordered_sections.append('raw_text')
497
+
498
+ # Add minimal spacing before OCR results
499
+ st.markdown("<div style='margin: 8px 0 4px 0;'></div>", unsafe_allow_html=True)
500
+ st.markdown("### Document Content")
501
+
502
+ # Process each section using expanders
503
+ for i, section in enumerate(ordered_sections):
504
+ content = result['ocr_contents'][section]
505
+
506
+ # Skip empty content
507
+ if not content:
508
+ continue
509
+
510
+ # Create an expander for each section
511
+ # First section is expanded by default
512
+ with st.expander(f"{section.replace('_', ' ').title()}", expanded=(i == 0)):
513
  if isinstance(content, str):
514
+ # Handle image markdown
515
+ if content.startswith("![") and content.endswith(")"):
516
+ try:
517
+ alt_text = content[2:content.index(']')]
518
+ st.info(f"Image description: {alt_text if len(alt_text) > 5 else 'Image'}")
519
+ except Exception:
520
+ st.info("Contains image reference")
521
+ else:
522
+ # Process text content
523
+ formatted_content = format_markdown_text(content).strip()
524
+
525
+ # Check if content contains markdown tables or complex text
526
+ has_tables = '|' in formatted_content and '---' in formatted_content
527
+ has_complex_structure = formatted_content.count('\n') > 5 or formatted_content.count('**') > 2
528
+
529
+ # Use a container with minimal margins
530
+ with st.container():
531
+ # For text-only extractions or content with tables, ensure proper rendering
532
+ if has_tables or has_complex_structure:
533
+ # For text with tables or multiple paragraphs, use special handling
534
+ # First ensure proper markdown spacing
535
+ formatted_content = formatted_content.replace('\n\n\n', '\n\n')
536
+
537
+ # Look for any all caps headers that might be misinterpreted
538
+ import re
539
+ formatted_content = re.sub(
540
+ r'^([A-Z][A-Z\s]+)$',
541
+ r'**\1**',
542
+ formatted_content,
543
+ flags=re.MULTILINE
544
+ )
545
+
546
+ # Preserve table formatting by adding proper spacing
547
+ if has_tables:
548
+ formatted_content = formatted_content.replace('\n|', '\n\n|')
549
+
550
+ # Add proper paragraph spacing
551
+ formatted_content = re.sub(r'([^\n])\n([^\n])', r'\1\n\n\2', formatted_content)
552
+
553
+ # Use standard markdown with custom styling
554
+ st.markdown(formatted_content, unsafe_allow_html=False)
555
+ else:
556
+ # For simpler content, use standard markdown
557
+ st.markdown(formatted_content)
558
+
559
  elif isinstance(content, list):
560
+ # Create markdown list
561
+ list_items = []
562
  for item in content:
563
  if isinstance(item, str):
564
+ item_text = format_markdown_text(item).strip()
565
+ # Handle potential HTML special characters for proper rendering
566
+ item_text = item_text.replace('<', '&lt;').replace('>', '&gt;')
567
+ list_items.append(f"- {item_text}")
568
  else:
569
+ list_items.append(f"- {str(item)}")
570
+
571
+ list_content = "\n".join(list_items)
572
+
573
+ # Use a container with minimal margins
574
+ with st.container():
575
+ # Use standard markdown for better rendering
576
+ st.markdown(list_content)
577
+
578
  elif isinstance(content, dict):
579
+ # Format dictionary content
580
+ dict_items = []
581
  for k, v in content.items():
582
+ key_formatted = k.replace('_', ' ').title()
583
+
584
+ if isinstance(v, str):
585
+ value_formatted = format_markdown_text(v).strip()
586
+ dict_items.append(f"**{key_formatted}:** {value_formatted}")
587
+ else:
588
+ dict_items.append(f"**{key_formatted}:** {str(v)}")
589
+
590
+ dict_content = "\n".join(dict_items)
591
+
592
+ # Use a container with minimal margins
593
+ with st.container():
594
+ # Use standard markdown for better rendering
595
+ st.markdown(dict_content)
596
 
597
+ # Display custom prompt if provided
598
+ if custom_prompt:
599
+ with st.expander("Custom Processing Instructions"):
600
+ st.write(custom_prompt)
601
+
602
+ # No download heading - start directly with buttons
603
 
604
+ # JSON download - use full width for buttons
605
  try:
606
  json_str = json.dumps(result, indent=2)
607
  st.download_button(
 
612
  )
613
  except Exception as e:
614
  st.error(f"Error creating JSON download: {str(e)}")
615
+
 
616
  # Text download
617
  try:
618
  if 'ocr_contents' in result:
 
636
 
637
  def display_document_with_images(result):
638
  """Display document with images"""
639
+ # Check for pages_data first
640
+ if 'pages_data' in result and result['pages_data']:
641
+ pages_data = result['pages_data']
642
+ # If pages_data not available, try to extract from raw_response_data
643
+ elif 'raw_response_data' in result and isinstance(result['raw_response_data'], dict) and 'pages' in result['raw_response_data']:
644
+ # Build pages_data from raw_response_data
645
+ pages_data = []
646
+ raw_pages = result['raw_response_data']['pages']
647
+
648
+ for page_idx, page in enumerate(raw_pages):
649
+ if not isinstance(page, dict):
650
+ continue
651
+
652
+ page_data = {
653
+ 'page_number': page_idx + 1,
654
+ 'markdown': page.get('markdown', ''),
655
+ 'images': []
656
+ }
657
+
658
+ # Extract images if present
659
+ if 'images' in page and isinstance(page['images'], list):
660
+ for img_idx, img in enumerate(page['images']):
661
+ if isinstance(img, dict) and ('base64' in img or 'image_base64' in img):
662
+ img_base64 = img.get('image_base64', img.get('base64', ''))
663
+ if img_base64:
664
+ page_data['images'].append({
665
+ 'id': img.get('id', f"img_{page_idx}_{img_idx}"),
666
+ 'image_base64': img_base64
667
+ })
668
+
669
+ if page_data['markdown'] or page_data['images']:
670
+ pages_data.append(page_data)
671
+ else:
672
  st.info("No image data available.")
673
  return
674
 
675
  # Display each page
676
+ for i, page_data in enumerate(pages_data):
677
  st.markdown(f"### Page {i+1}")
678
 
679
  # Create columns for image and text
680
  img_col, text_col = st.columns([1, 1])
681
 
682
  with img_col:
683
+ # Display the image - check multiple possible field names
684
+ image_displayed = False
685
+
686
+ # Try 'image_data' field first
687
  if 'image_data' in page_data:
688
  try:
689
  # Convert base64 to image
690
  image_data = base64.b64decode(page_data['image_data'])
691
  st.image(io.BytesIO(image_data), use_container_width=True)
692
+ image_displayed = True
693
  except Exception as e:
694
+ st.error(f"Error displaying image from image_data: {str(e)}")
695
+
696
+ # Try 'images' array if image_data didn't work
697
+ if not image_displayed and 'images' in page_data and len(page_data['images']) > 0:
698
+ for img in page_data['images']:
699
+ if 'image_base64' in img:
700
+ try:
701
+ st.image(img['image_base64'], use_container_width=True)
702
+ image_displayed = True
703
+ break
704
+ except Exception as e:
705
+ st.error(f"Error displaying image from images array: {str(e)}")
706
+
707
+ if not image_displayed:
708
  st.info("No image available for this page.")
709
 
710
  with text_col:
711
+ # Get text from various possible fields
712
+ page_text = ""
713
  if 'text' in page_data:
714
+ page_text = page_data['text']
715
+ elif 'markdown' in page_data:
716
+ page_text = page_data['markdown']
717
+
718
+ # Special handling for image markdown in page data
719
+ if page_text.startswith("![") and page_text.endswith(")"):
720
+ # Try to display image if not already displayed
721
+ if not image_displayed and 'raw_response_data' in result:
722
+ raw_data = result['raw_response_data']
723
+ if isinstance(raw_data, dict) and 'pages' in raw_data:
724
+ for raw_page in raw_data['pages']:
725
+ if isinstance(raw_page, dict) and 'images' in raw_page:
726
+ for img in raw_page['images']:
727
+ if isinstance(img, dict) and 'base64' in img:
728
+ st.image(img['base64'])
729
+ st.caption("Image from OCR response")
730
+ image_displayed = True
731
+ break
732
+ if image_displayed:
733
+ break
734
+
735
+ # Try to extract alt text
736
+ try:
737
+ alt_text = page_text[2:page_text.index(']')]
738
+ if alt_text and len(alt_text) > 5: # Only show if alt text is meaningful
739
+ st.info(f"Image description: {alt_text}")
740
+ else:
741
+ st.info("This page contains an image with minimal text")
742
+ except Exception:
743
+ st.info("This page contains an image with minimal text")
744
+
745
+ # Show warning if no image displayed
746
+ if not image_displayed:
747
+ st.warning("Image reference found in text, but no image data is available.")
748
+
749
+ # If no text found but we have raw_text in ocr_contents
750
+ if not page_text and 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
751
+ page_text = result['ocr_contents']['raw_text']
752
+
753
+ # Display the text with editing capability
754
+ if page_text:
755
+ edited_text = st.text_area(f"Page {i+1} Text", page_text, height=300, key=f"page_text_{i}")
756
 
757
+ # Add a simple button to copy the edited text to clipboard
758
+ st.button(f"Copy Text", key=f"copy_btn_{i}")
 
759
  else:
760
  st.info("No text available for this page.")
761
 
762
  def display_previous_results():
763
+ """Display previous results tab content in a simplified, structured view"""
 
764
 
765
+ # Use a clean header with the download button directly next to it
766
+ col1, col2 = st.columns([3, 1])
767
+ with col1:
768
+ st.header("Previous Results")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
769
 
770
  # Display previous results if available
771
  if not st.session_state.previous_results:
772
  st.markdown("""
773
+ <div style="text-align: center; padding: 30px 20px; background-color: #f8f9fa; border-radius: 6px; margin-top: 10px;">
774
+ <div style="font-size: 36px; margin-bottom: 15px;">📄</div>
775
+ <h4 style="margin-bottom: 8px; font-weight: 500;">No Previous Results</h4>
776
+ <p style="font-size: 14px; color: #666;">Process a document to see your results history.</p>
777
  </div>
778
  """, unsafe_allow_html=True)
779
  else:
780
+ # Add download button in the second column next to the header
781
+ with col2:
782
+ try:
783
+ # Create download button for all results
784
+ from ocr_utils import create_results_zip_in_memory
785
+ zip_data = create_results_zip_in_memory(st.session_state.previous_results)
786
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
787
 
788
+ # Simplified filename
789
+ zip_filename = f"ocr_results_{timestamp}.zip"
790
+
791
+ st.download_button(
792
+ label="Download All",
793
+ data=zip_data,
794
+ file_name=zip_filename,
795
+ mime="application/zip",
796
+ help="Download all results as ZIP"
797
+ )
798
+ except Exception:
799
+ # Silent fail - no error message to keep UI clean
800
+ pass
801
 
802
+ # Create a cleaner, more minimal grid for results using Streamlit columns
803
+ # Use a fixed two-column grid for the results list
804
+ num_columns = 2 # Two columns for most screens
805
+
806
+ # Create rows of result cards
807
+ for i in range(0, len(st.session_state.previous_results), num_columns):
808
+ # Create a row of columns
809
+ cols = st.columns(num_columns)
810
+
811
+ # Fill each column with a result card
812
+ for j in range(num_columns):
813
+ index = i + j
814
+ if index < len(st.session_state.previous_results):
815
+ result = st.session_state.previous_results[index]
816
 
817
+ # Get basic info for the card
818
+ file_name = result.get("file_name", f"Document {index+1}")
819
+ timestamp = result.get("timestamp", "")
820
 
821
+ # Determine file type icon
822
+ if file_name.lower().endswith(".pdf"):
823
+ icon = "📄"
824
+ elif any(file_name.lower().endswith(ext) for ext in [".jpg", ".jpeg", ".png", ".gif"]):
825
+ icon = "🖼️"
 
 
826
  else:
827
+ icon = "📝"
828
 
829
+ # Display a simplified card in each column
830
+ with cols[j]:
831
+ # Use a container for better styling control
832
+ with st.container():
833
+ # Create visually cleaner card with less vertical space
834
+ st.markdown(f"""
835
+ <div style="padding: 10px; border: 1px solid #e0e0e0; border-radius: 6px; margin-bottom: 10px;">
836
+ <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 5px;">
837
+ <div style="font-weight: 500; font-size: 14px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap;">{icon} {file_name}</div>
838
+ <div style="color: #666; font-size: 12px;">{timestamp.split()[0] if timestamp else ""}</div>
839
+ </div>
840
+ </div>
841
+ """, unsafe_allow_html=True)
842
+
843
+ # Add a simple button below each card
844
+ if st.button(f"View", key=f"view_{index}", help=f"View {file_name}"):
845
+ st.session_state.selected_previous_result = st.session_state.previous_results[index]
846
+ st.rerun()
847
 
848
  # Display the selected result if available
849
  if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
850
  selected_result = st.session_state.selected_previous_result
851
 
852
+ # Draw a separator between results list and selected document
853
+ st.markdown("<hr style='margin: 20px 0 15px 0; border: none; height: 1px; background-color: #eee;'>", unsafe_allow_html=True)
854
+
855
+ # Create a cleaner header for the selected document
856
+ file_name = selected_result.get('file_name', 'Document')
857
+ st.subheader(f"{file_name}")
858
+
859
+ # Add a simple back button at the top
860
+ if st.button("← Back to Results", key="back_to_results"):
861
+ if 'selected_previous_result' in st.session_state:
862
+ del st.session_state.selected_previous_result
863
+ st.session_state.perform_reset = True
864
+ st.rerun()
865
+
866
+ # Simplified metadata display - just one line with essential info
867
+ meta_html = '<div style="display: flex; flex-wrap: wrap; gap: 12px; margin: 8px 0 15px 0; font-size: 14px; color: #666;">'
868
+
869
+ # Add timestamp
870
+ if 'timestamp' in selected_result:
871
+ meta_html += f'<div>{selected_result["timestamp"]}</div>'
872
 
873
+ # Add languages if available (simplified)
874
+ if 'languages' in selected_result and selected_result['languages']:
875
+ languages = [lang for lang in selected_result['languages'] if lang is not None]
876
+ if languages:
877
+ meta_html += f'<div>Language: {", ".join(languages)}</div>'
878
 
879
+ # Add page count if available (simplified)
880
+ if 'limited_pages' in selected_result:
881
+ meta_html += f'<div>Pages: {selected_result["limited_pages"]["processed"]}/{selected_result["limited_pages"]["total"]}</div>'
 
882
 
883
+ meta_html += '</div>'
884
+ st.markdown(meta_html, unsafe_allow_html=True)
 
885
 
886
+ # Simplified tabs - fewer options for cleaner interface
887
  has_images = selected_result.get('has_images', False)
888
  if has_images:
889
+ view_tabs = st.tabs(["Document Content", "Raw Text", "Images"])
890
+ view_tab1, view_tab2, view_tab3 = view_tabs
891
  else:
892
+ view_tabs = st.tabs(["Document Content", "Raw Text"])
893
+ view_tab1, view_tab2 = view_tabs
894
 
895
+ # Define helper function for formatting text
896
+ def format_text_display(text):
897
+ if not isinstance(text, str):
898
+ return text
899
+
900
+ lines = text.split('\n')
901
+ processed_lines = []
902
+ for line in lines:
903
+ line_stripped = line.strip()
904
+ if line_stripped and line_stripped.isupper() and len(line_stripped) > 3:
905
+ processed_lines.append(f"**{line_stripped}**")
906
+ else:
907
+ processed_lines.append(line)
908
+
909
+ return '\n'.join(processed_lines)
910
+
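To illustrate the helper just defined (an illustrative call only; in the app it runs on OCR output inside this function):

```python
# Stand-alone ALL-CAPS lines longer than three characters are bolded,
# everything else passes through unchanged.
sample = "CHAPTER ONE\nIt was a dark and stormy night."
print(format_text_display(sample))
# **CHAPTER ONE**
# It was a dark and stormy night.
```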
911
+ # First tab - Document Content (simplified structured view)
912
  with view_tab1:
913
+ # Display content in a cleaner, more streamlined format
914
  if 'ocr_contents' in selected_result and isinstance(selected_result['ocr_contents'], dict):
915
+ # Create a more focused list of important sections
916
+ priority_sections = ["title", "content", "transcript", "summary", "raw_text"]
917
+ displayed_sections = set()
918
+
919
+ # First display priority sections
920
+ for section in priority_sections:
921
+ if section in selected_result['ocr_contents'] and selected_result['ocr_contents'][section]:
922
+ content = selected_result['ocr_contents'][section]
923
+ if isinstance(content, str) and content.strip():
924
+ # Only add a subheader for meaningful section names, not raw_text
925
+ if section != "raw_text":
926
+ st.markdown(f"##### {section.replace('_', ' ').title()}")
927
+
928
+ # Format and display content
929
+ formatted_content = format_text_display(content)
930
+ st.markdown(formatted_content)
931
+ displayed_sections.add(section)
932
+
933
+ # Then display any remaining sections not already shown
934
  for section, content in selected_result['ocr_contents'].items():
935
+ if (section not in displayed_sections and
936
+ section not in ['error', 'partial_text'] and
937
+ content):
938
+ st.markdown(f"##### {section.replace('_', ' ').title()}")
939
 
940
  if isinstance(content, str):
941
+ st.markdown(format_text_display(content))
942
  elif isinstance(content, list):
943
  for item in content:
944
+ st.markdown(f"- {item}")
 
 
 
945
  elif isinstance(content, dict):
946
  for k, v in content.items():
947
+ st.markdown(f"**{k}:** {v}")
948
 
949
+ # Second tab - Raw Text (simplified)
950
  with view_tab2:
951
+ # Extract raw text or content
952
  raw_text = ""
953
  if 'ocr_contents' in selected_result:
954
  if 'raw_text' in selected_result['ocr_contents']:
 
956
  elif 'content' in selected_result['ocr_contents']:
957
  raw_text = selected_result['ocr_contents']['content']
958
 
959
+ # Display the text area with raw text
960
+ edited_text = st.text_area("", raw_text, height=300, key="selected_raw_text")
961
 
962
+ # Add buttons in a row
963
+ col1, col2 = st.columns(2)
964
+ with col1:
965
+ st.button("Copy Text", key="selected_copy_btn")
966
+ with col2:
967
+ st.download_button(
968
+ label="Download Text",
969
+ data=edited_text,
970
+ file_name=f"{file_name.split('.')[0]}_text.txt",
971
+ mime="text/plain",
972
+ key="selected_download_btn"
973
+ )
974
 
975
+ # Third tab - With Images (simplified)
976
  if has_images and 'pages_data' in selected_result:
977
  with view_tab3:
978
+ # Simplified image display
979
+ if 'pages_data' in selected_result:
980
+ for i, page_data in enumerate(selected_result['pages_data']):
981
+ # Display each page
982
+ if 'images' in page_data and len(page_data['images']) > 0:
983
+ for img in page_data['images']:
984
+ if 'image_base64' in img:
985
+ st.image(img['image_base64'], use_container_width=True)
986
+
987
+ # Get page text if available
988
+ page_text = ""
989
+ if 'markdown' in page_data:
990
+ page_text = page_data['markdown']
991
+
992
+ # Display text if available
993
+ if page_text:
994
+ with st.expander(f"Page {i+1} Text", expanded=False):
995
+ st.text(page_text)
996
 
997
  def display_about_tab():
998
  """Display about tab content"""
999
+ st.header("About")
1000
 
1001
  # Add app description
1002
  st.markdown("""
1003
  **Historical OCR** is a specialized tool for extracting text from historical documents, manuscripts, and printed materials.
1004
+ """)
1005
 
1006
+ # Purpose section with consistent formatting
1007
+ st.markdown("### Purpose")
1008
+ st.markdown("""
1009
  This tool is designed to assist scholars in historical research by extracting text from challenging documents.
1010
  While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
1011
  historical documents, particularly:
1012
+ """)
1013
 
1014
+ st.markdown("""
1015
  - **Historical newspapers** with complex layouts and aged text
1016
  - **Handwritten documents** from various time periods
1017
  - **Photos of archival materials** that may be difficult to read
1018
+ """)
1019
 
1020
+ # Features section with consistent formatting
1021
+ st.markdown("### Features")
1022
+ st.markdown("""
1023
  - **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
1024
  - **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
1025
  - **Editable Results**: Review and edit extracted text directly in the interface
1026
  - **Structured Content Analysis**: Automatic organization of document content
1027
  - **Multi-language Support**: Process documents in various languages
1028
  - **PDF Processing**: Handle multi-page historical documents
1029
+ """)
1030
 
1031
+ # How to Use section with consistent formatting
1032
+ st.markdown("### How to Use")
1033
+ st.markdown("""
1034
  1. Upload a document (PDF or image)
1035
  2. Select the document type and adjust preprocessing options if needed
1036
  3. Add custom processing instructions for specialized documents
1037
  4. Process the document
1038
  5. Review, edit, and download the results
1039
+ """)
1040
 
1041
+ # Technologies section with consistent formatting
1042
+ st.markdown("### Technologies")
1043
+ st.markdown("""
1044
  - OCR processing using Mistral AI's advanced document understanding capabilities
1045
  - Image preprocessing with OpenCV
1046
  - PDF handling with pdf2image
utils.py CHANGED
@@ -13,12 +13,76 @@ logger = logging.getLogger("utils")
13
  logger.setLevel(logging.INFO)
14
 
15
  def get_base64_from_image(image_path):
16
- """Get base64 string from image file"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  try:
18
- with open(image_path, "rb") as img_file:
19
- return base64.b64encode(img_file.read()).decode('utf-8')
20
  except Exception as e:
21
- logger.error(f"Error encoding image to base64: {str(e)}")
22
  return ""
23
 
24
  def timing(description):
 
13
  logger.setLevel(logging.INFO)
14
 
15
  def get_base64_from_image(image_path):
16
+ """
17
+ Get base64 data URL from image file with proper MIME type.
18
+
19
+ Args:
20
+ image_path: Path to the image file
21
+
22
+ Returns:
23
+ Base64 data URL with appropriate MIME type prefix
24
+ """
25
+ try:
26
+ # Convert to Path object for better handling
27
+ path_obj = Path(image_path)
28
+
29
+ # Determine mime type based on file extension
30
+ mime_type = 'image/jpeg' # Default mime type
31
+ suffix = path_obj.suffix.lower()
32
+ if suffix == '.png':
33
+ mime_type = 'image/png'
34
+ elif suffix == '.gif':
35
+ mime_type = 'image/gif'
36
+ elif suffix in ['.jpg', '.jpeg']:
37
+ mime_type = 'image/jpeg'
38
+ elif suffix == '.pdf':
39
+ mime_type = 'application/pdf'
40
+
41
+ # Read and encode file
42
+ with open(path_obj, "rb") as file:
43
+ encoded = base64.b64encode(file.read()).decode('utf-8')
44
+ return f"data:{mime_type};base64,{encoded}"
45
+ except Exception as e:
46
+ logger.error(f"Error encoding file to base64: {str(e)}")
47
+ return ""
48
+
49
+ def get_base64_from_bytes(file_bytes, mime_type=None, file_name=None):
50
+ """
51
+ Get base64 data URL from file bytes with proper MIME type.
52
+
53
+ Args:
54
+ file_bytes: Binary file data
55
+ mime_type: MIME type of the file (optional)
56
+ file_name: Original file name for MIME type detection (optional)
57
+
58
+ Returns:
59
+ Base64 data URL with appropriate MIME type prefix
60
+ """
61
  try:
62
+ # Determine mime type if not provided
63
+ if mime_type is None and file_name is not None:
64
+ # Get file extension
65
+ suffix = Path(file_name).suffix.lower()
66
+ if suffix == '.png':
67
+ mime_type = 'image/png'
68
+ elif suffix == '.gif':
69
+ mime_type = 'image/gif'
70
+ elif suffix in ['.jpg', '.jpeg']:
71
+ mime_type = 'image/jpeg'
72
+ elif suffix == '.pdf':
73
+ mime_type = 'application/pdf'
74
+ else:
75
+ # Default to octet-stream for unknown types
76
+ mime_type = 'application/octet-stream'
77
+ elif mime_type is None:
78
+ # Default MIME type if we can't determine it
79
+ mime_type = 'application/octet-stream'
80
+
81
+ # Encode and create data URL
82
+ encoded = base64.b64encode(file_bytes).decode('utf-8')
83
+ return f"data:{mime_type};base64,{encoded}"
84
  except Exception as e:
85
+ logger.error(f"Error encoding bytes to base64: {str(e)}")
86
  return ""
87
 
88
  def timing(description):
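For completeness, a hedged usage sketch of the two helpers added above (file paths are illustrative; the module name assumes the repo's utils.py):

```python
from utils import get_base64_from_image, get_base64_from_bytes

# Path-based helper returns a data URL with the inferred MIME type,
# or "" if encoding fails.
data_url = get_base64_from_image("samples/letter.png")
# e.g. "data:image/png;base64,iVBORw0KGgo..."

# Bytes-based helper infers the MIME type from the optional file name.
with open("samples/scan.pdf", "rb") as f:
    pdf_url = get_base64_from_bytes(f.read(), file_name="scan.pdf")
# e.g. "data:application/pdf;base64,JVBERi0x..."
```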