"""Streamlit UI components: progress reporting, sidebar options, the file
uploader, per-page image display, the previous-results view and the about tab.
"""

import base64
import html
import io
import json
import logging
import os
import re
import time
from datetime import datetime
from pathlib import Path

import streamlit as st

from constants import (
    DOCUMENT_TYPES,
    DOCUMENT_LAYOUTS,
    CUSTOM_PROMPT_TEMPLATES,
    LAYOUT_PROMPT_ADDITIONS,
    DEFAULT_PDF_DPI,
    MIN_PDF_DPI,
    MAX_PDF_DPI,
    DEFAULT_MAX_PAGES,
    PERFORMANCE_MODES,
    PREPROCESSING_DOC_TYPES,
    ROTATION_OPTIONS
)
from utils.text_utils import format_ocr_text, clean_raw_text, format_markdown_text  # Import from text_utils
from utils.content_utils import (
    classify_document_content,
    extract_document_text,
    extract_image_description
)
from utils.ui_utils import display_results
from preprocessing import preprocess_image

# Define exports
__all__ = [
    'ProgressReporter',
    'create_sidebar_options',
    'create_file_uploader',
    'display_document_with_images',
    'display_previous_results',
    'display_about_tab',
    'display_results'  # Re-export from utils.ui_utils
]

logger = logging.getLogger(__name__)


class ProgressReporter:
    """Drive a progress bar and a status line rendered inside a placeholder."""

    def __init__(self, placeholder):
        """Remember the placeholder; widgets are created later by ``setup``."""
        self.placeholder = placeholder
        self.progress_bar = None
        self.status_text = None

    def setup(self):
        """Create the progress bar and status line; return ``self`` for chaining."""
        with self.placeholder.container():
            self.progress_bar = st.progress(0)
            self.status_text = st.empty()
        return self

    def update(self, percent, status_text):
        """Show ``percent`` (0-100) on the bar and ``status_text`` below it.

        Values outside 0-100 are clamped, because ``st.progress`` rejects
        fractions outside 0.0-1.0.
        """
        if self.progress_bar is not None:
            bounded = min(max(percent, 0), 100)
            self.progress_bar.progress(bounded / 100)
        if self.status_text is not None:
            self.status_text.text(status_text)

    def complete(self, success=True):
        """Show the final state, pause briefly, then clear both widgets.

        On success the bar is filled and "Processing complete!" is shown;
        otherwise only "Processing failed." is shown.
        """
        if success:
            if self.progress_bar is not None:
                self.progress_bar.progress(100)
            if self.status_text is not None:
                self.status_text.text("Processing complete!")
        elif self.status_text is not None:
            self.status_text.text("Processing failed.")

        # Short delay so the final message is visible before it is cleared.
        time.sleep(0.8)
        if self.progress_bar is not None:
            self.progress_bar.empty()
        if self.status_text is not None:
            self.status_text.empty()


def _default_prompt(doc_type, doc_layout):
    """Return the pre-filled processing prompt for a document type and layout."""
    # The first DOCUMENT_TYPES entry gets no template.
    if doc_type == DOCUMENT_TYPES[0]:
        return ""

    prompt = CUSTOM_PROMPT_TEMPLATES.get(doc_type, "")
    # The first DOCUMENT_LAYOUTS entry adds nothing to the prompt.
    if doc_layout != DOCUMENT_LAYOUTS[0]:
        layout_addition = LAYOUT_PROMPT_ADDITIONS.get(doc_layout, "")
        if layout_addition:
            prompt += " " + layout_addition
    return prompt


def _preprocessing_document_type(doc_type):
    """Map the UI document type label to the preprocessing document type."""
    if "Handwritten" in doc_type:
        return "handwritten"
    if "Newspaper" in doc_type or "Magazine" in doc_type:
        return "newspaper"
    if "Book" in doc_type or "Publication" in doc_type:
        return "book"
    return "standard"


def create_sidebar_options():
    """Render the sidebar widgets and return the chosen options.

    Returns:
        dict with keys ``use_vision``, ``perf_mode``, ``pdf_dpi``,
        ``max_pages``, ``pdf_rotation``, ``custom_prompt``,
        ``preprocessing_options`` and ``use_segmentation``.
    """
    with st.sidebar:
        st.markdown("## OCR Settings")

        with st.container():
            doc_type = st.selectbox(
                "Document Type",
                DOCUMENT_TYPES,
                help="Select the type of document you're processing for better results",
            )
            doc_layout = st.selectbox(
                "Document Layout",
                DOCUMENT_LAYOUTS,
                help="Select the layout of your document",
            )

            # The prompt is pre-filled from the selections but stays editable.
            custom_prompt = st.text_area(
                "Custom Processing Instructions",
                value=_default_prompt(doc_type, doc_layout),
                help="Customize the instructions for processing this document",
                height=80,
            )

            st.markdown("### Image Preprocessing")
            grayscale = st.checkbox(
                "Convert to Grayscale",
                value=True,
                help="Convert color images to grayscale for better text recognition",
            )
            denoise = st.checkbox(
                "Light Denoising",
                value=True,
                help="Apply gentle denoising to improve text clarity",
            )
            contrast = st.slider(
                "Contrast Adjustment",
                min_value=-20,
                max_value=20,
                value=5,
                step=5,
                help="Adjust image contrast (limited range)",
            )

            preprocessing_options = {
                "document_type": _preprocessing_document_type(doc_type),
                "grayscale": grayscale,
                "denoise": denoise,
                "contrast": contrast,
                # Rotation has no widget; it is always 0.
                "rotation": 0,
            }

            st.markdown("### PDF Options")
            max_pages = st.number_input(
                "Maximum Pages to Process",
                min_value=1,
                max_value=20,
                value=DEFAULT_MAX_PAGES,
                help="Limit the number of pages to process (for multi-page PDFs)",
            )

    # Options without a widget keep fixed values so the dict shape is stable.
    return {
        "use_vision": True,
        "perf_mode": "Quality",
        "pdf_dpi": DEFAULT_PDF_DPI,
        "max_pages": max_pages,
        "pdf_rotation": 0,
        "custom_prompt": custom_prompt,
        "preprocessing_options": preprocessing_options,
        "use_segmentation": False,
    }


def create_file_uploader():
    """Render the app header, the project framing text and the file uploader.

    Returns:
        Whatever ``st.file_uploader`` returns for the "Select file" widget.
    """
    # NOTE(review): unsafe_allow_html is set on the next two calls but no
    # markup is visible in the strings; confirm whether styling tags are
    # missing from this copy of the file.
    st.markdown('\n📜\n\nHistorical OCR\n\n', unsafe_allow_html=True)
    st.markdown("\n\nMade possible by Mistral AI\n\n", unsafe_allow_html=True)

    # Project framing
    st.markdown("""
    This tool assists scholars in historical research by extracting text from challenging documents. While it may not achieve 100% accuracy, it helps navigate:

    - **Historical newspapers** with complex layouts
    - **Handwritten documents** from various periods
    - **Photos of archival materials**

    Upload a document to begin, or explore the examples.
    """)

    uploaded_file = st.file_uploader(
        "Select file",
        type=["pdf", "png", "jpg"],
        help="Upload a PDF or image file for OCR processing"
    )
    return uploaded_file


def _pages_from_raw_response(raw_pages):
    """Build a ``pages_data``-style list from the raw response ``pages`` list.

    Non-dict pages are skipped, and so are pages that end up with neither
    markdown nor images.
    """
    pages_data = []
    for page_idx, page in enumerate(raw_pages or []):
        if not isinstance(page, dict):
            continue

        images = []
        raw_images = page.get('images')
        if isinstance(raw_images, list):
            for img_idx, img in enumerate(raw_images):
                if not isinstance(img, dict):
                    continue
                # Use `or` so an empty 'image_base64' does not hide a
                # populated 'base64' entry.
                img_base64 = img.get('image_base64') or img.get('base64', '')
                if img_base64:
                    images.append({
                        'id': img.get('id', f"img_{page_idx}_{img_idx}"),
                        'image_base64': img_base64
                    })

        page_data = {
            'page_number': page_idx + 1,
            'markdown': page.get('markdown', ''),
            'images': images
        }
        if page_data['markdown'] or page_data['images']:
            pages_data.append(page_data)
    return pages_data


def _render_page_image(page_data, result):
    """Show one image for a page; return True when something was displayed.

    Sources are tried in order: the page's ``image_data`` field, the page's
    ``images`` list, then any image in ``result['raw_response_data']``.
    """
    if 'image_data' in page_data:
        try:
            image_bytes = base64.b64decode(page_data['image_data'])
            st.image(io.BytesIO(image_bytes), use_container_width=True)
            return True
        except Exception as e:
            st.error(f"Error displaying image from image_data: {str(e)}")

    for img in page_data.get('images') or []:
        if 'image_base64' in img:
            try:
                st.image(img['image_base64'], use_container_width=True)
                return True
            except Exception as e:
                st.error(f"Error displaying image from images array: {str(e)}")

    # NOTE(review): this fallback shows the first image found on any raw
    # page, which is not necessarily the image for the current page.
    raw_data = result.get('raw_response_data') if 'raw_response_data' in result else None
    if isinstance(raw_data, dict) and 'pages' in raw_data:
        for raw_page in raw_data['pages']:
            if isinstance(raw_page, dict) and 'images' in raw_page:
                for img in raw_page['images']:
                    if isinstance(img, dict) and 'base64' in img:
                        st.image(img['base64'], use_container_width=True)
                        st.caption("Image from OCR response")
                        return True
    return False


def _image_alt_text(page_text):
    """Return the alt text when ``page_text`` is a single ``![alt](...)`` image, else ""."""
    if not isinstance(page_text, str):
        return ""
    if not (page_text.startswith("![") and page_text.endswith(")")):
        return ""
    closing = page_text.find(']')
    if closing == -1:
        return ""
    return page_text[2:closing]


def display_document_with_images(result):
    """Display one image per page of an OCR result, with alt text when present.

    Pages come from ``result['pages_data']`` when it is non-empty, otherwise
    they are rebuilt from ``result['raw_response_data']['pages']``. If neither
    is available an info message is shown and nothing else is rendered.

    Args:
        result: OCR result dict.
    """
    raw_response = result.get('raw_response_data') if 'raw_response_data' in result else None
    if 'pages_data' in result and result['pages_data']:
        pages_data = result['pages_data']
    elif isinstance(raw_response, dict) and 'pages' in raw_response:
        pages_data = _pages_from_raw_response(raw_response['pages'])
    else:
        st.info("No image data available.")
        return

    for i, page_data in enumerate(pages_data):
        st.markdown(f"### Page {i+1}")

        if not _render_page_image(page_data, result):
            st.info("No image available for this page.")

        # Page text is read from 'text' first, then 'markdown'.
        page_text = ""
        if 'text' in page_data:
            page_text = page_data['text']
        elif 'markdown' in page_data:
            page_text = page_data['markdown']

        alt_text = _image_alt_text(page_text)
        if len(alt_text) > 5:  # Only show if alt text is meaningful
            st.caption(f"Image description: {alt_text}")


def _render_results_download(results):
    """Offer every stored result as a single zip download.

    Failures are logged and otherwise ignored so the results list still
    renders.
    """
    try:
        from utils.image_utils import create_results_zip_in_memory

        zip_data = create_results_zip_in_memory(results)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # The visible code computed the zip and its file name but never
        # attached them to the link text, so a real download widget is used.
        st.download_button(
            label="Download All Results",
            data=zip_data,
            file_name=f"ocr_results_{timestamp}.zip",
            mime="application/zip",
            key="download_all_previous_results",
        )
    except Exception:
        # Keep the UI clean, but leave a trace for debugging.
        logger.exception("Could not prepare the zip download of previous results")


def _file_icon(file_name):
    """Pick an emoji icon from the file extension."""
    lowered = file_name.lower()
    if lowered.endswith(".pdf"):
        return "📄"
    if lowered.endswith((".jpg", ".jpeg", ".png", ".gif")):
        return "🖼️"
    return "📝"


def _render_result_cards(results, num_columns=2):
    """Render the results as rows of cards, each with a "View" button."""
    for row_start in range(0, len(results), num_columns):
        columns = st.columns(num_columns)
        for offset, column in enumerate(columns):
            index = row_start + offset
            if index >= len(results):
                break

            result = results[index]
            file_name = str(result.get("file_name") or f"Document {index+1}")
            # Only the first whitespace-separated token of the timestamp is shown.
            timestamp_parts = str(result.get("timestamp") or "").split()
            date_part = timestamp_parts[0] if timestamp_parts else ""
            icon = _file_icon(file_name)

            with column:
                with st.container():
                    # Values are escaped because the card is rendered with
                    # unsafe_allow_html and file names come from uploads.
                    # NOTE(review): no markup is visible in this string;
                    # confirm whether styling tags are missing here.
                    st.markdown(
                        f"\n{icon} {html.escape(file_name)}\n{html.escape(date_part)}\n",
                        unsafe_allow_html=True,
                    )
                    if st.button("View", key=f"view_{index}", help=f"View {file_name}"):
                        st.session_state.selected_previous_result = results[index]
                        st.rerun()


def _render_result_metadata(selected_result):
    """Render timestamp, languages and page counts for the selected result."""
    # NOTE(review): only text and line breaks are visible in these fragments;
    # confirm whether wrapping markup is missing here.
    parts = ['\n']

    if 'timestamp' in selected_result:
        parts.append(f'\n{html.escape(str(selected_result["timestamp"]))}\n')

    languages = [
        str(lang)
        for lang in (selected_result.get('languages') or [])
        if lang is not None
    ]
    if languages:
        parts.append(f'\nLanguage: {html.escape(", ".join(languages))}\n')

    limited_pages = selected_result.get('limited_pages')
    if isinstance(limited_pages, dict) and 'processed' in limited_pages and 'total' in limited_pages:
        processed = html.escape(str(limited_pages["processed"]))
        total = html.escape(str(limited_pages["total"]))
        parts.append(f'\nPages: {processed}/{total}\n')

    parts.append('\n')
    st.markdown("".join(parts), unsafe_allow_html=True)


def _render_ocr_contents(selected_result):
    """Render the ``ocr_contents`` sections: priority sections first, then the rest."""
    ocr_contents = selected_result.get('ocr_contents')
    if not isinstance(ocr_contents, dict):
        return

    priority_sections = ["title", "content", "transcript", "summary"]
    displayed_sections = set()

    # Only non-blank string values are rendered in this first pass.
    for section in priority_sections:
        content = ocr_contents.get(section)
        if isinstance(content, str) and content.strip():
            st.markdown(f"##### {section.replace('_', ' ').title()}")
            st.markdown(format_ocr_text(content, for_display=True))
            displayed_sections.add(section)

    for section, content in ocr_contents.items():
        if section in displayed_sections or section in ('error', 'partial_text') or not content:
            continue
        st.markdown(f"##### {section.replace('_', ' ').title()}")
        if isinstance(content, str):
            st.markdown(format_ocr_text(content, for_display=True))
        elif isinstance(content, list):
            for item in content:
                st.markdown(f"- {item}")
        elif isinstance(content, dict):
            for k, v in content.items():
                st.markdown(f"**{k}:** {v}")


def _render_result_json(selected_result):
    """Render a trimmed JSON view of the result, without base64 image payloads."""
    metadata_fields = [
        'file_name', 'timestamp', 'processing_time', 'languages',
        'topics', 'subjects', 'detected_document_type', 'text',
    ]
    json_data = {
        field: selected_result[field]
        for field in metadata_fields
        if field in selected_result
    }

    if 'ocr_contents' in selected_result:
        json_data['ocr_contents'] = selected_result['ocr_contents']

    # Pages are summarised instead of embedded.
    if 'pages_data' in selected_result:
        simplified_pages = []
        for page in selected_result['pages_data'] or []:
            if not isinstance(page, dict):
                continue
            images = page.get('images') or []
            simplified_pages.append({
                'page_number': page.get('page_number', 0),
                'has_text': bool(page.get('markdown', '')),
                'has_images': bool(images),
                'image_count': len(images)
            })
        json_data['pages_summary'] = simplified_pages

    # Display only: default=str keeps a non-serialisable value from crashing
    # the tab, and ensure_ascii=False keeps non-ASCII text readable.
    json_str = json.dumps(json_data, indent=2, ensure_ascii=False, default=str)
    st.code(json_str, language="json")


def _render_result_images(selected_result):
    """Render each page's images, followed by that page's text in an expander."""
    for i, page_data in enumerate(selected_result.get('pages_data') or []):
        images = page_data.get('images') if 'images' in page_data else None
        if not images:
            continue

        for img in images:
            if 'image_base64' in img:
                st.image(img['image_base64'], use_container_width=True)

        page_text = page_data['markdown'] if 'markdown' in page_data else ""
        if page_text:
            with st.expander(f"Page {i+1} Text", expanded=False):
                st.text(page_text)


def _render_selected_result(selected_result):
    """Render the detail view (header, back button, metadata, tabs) of one result."""
    # Separator between the results list and the selected document.
    # NOTE(review): the string holds only a newline; confirm whether a
    # separator tag is missing here.
    st.markdown("\n", unsafe_allow_html=True)

    st.subheader(str(selected_result.get('file_name', 'Document')))

    if st.button("← Back to Results", key="back_to_results"):
        st.session_state.pop('selected_previous_result', None)
        st.session_state.perform_reset = True
        st.rerun()

    _render_result_metadata(selected_result)

    # The Images tab only exists when the result reports having images.
    has_images = selected_result.get('has_images', False)
    if has_images:
        content_tab, json_tab, images_tab = st.tabs(["Document Content", "Raw JSON", "Images"])
    else:
        content_tab, json_tab = st.tabs(["Document Content", "Raw JSON"])
        images_tab = None

    with content_tab:
        _render_ocr_contents(selected_result)

    with json_tab:
        _render_result_json(selected_result)

    if images_tab is not None:
        with images_tab:
            _render_result_images(selected_result)


def display_previous_results():
    """Display the "Previous Results" tab.

    Reads ``previous_results`` and ``selected_previous_result`` from
    ``st.session_state``. With no stored results an empty-state message is
    shown; otherwise a zip download, a two-column grid of result cards and,
    when one is selected, its detail view.
    """
    st.header("Previous Results")

    # .get avoids an AttributeError when the key was never initialised.
    results = st.session_state.get("previous_results")
    if not results:
        # NOTE(review): no markup is visible in this string although
        # unsafe_allow_html is set; confirm whether styling tags are missing.
        st.markdown(
            "\n📄\nNo Previous Results\n\nProcess a document to see your results history.\n\n",
            unsafe_allow_html=True,
        )
        return

    _render_results_download(results)
    _render_result_cards(results, num_columns=2)

    selected_result = st.session_state.get('selected_previous_result')
    if selected_result:
        _render_selected_result(selected_result)


def display_about_tab():
    """Display the "Learn More" tab: description, purpose, features, usage, technologies and version."""
    st.header("Learn More")

    # App description
    st.markdown("""
    **Historical OCR** is a tailored academic tool for extracting text from historical documents, manuscripts, and printed materials.
    """)

    st.markdown("### Purpose")
    st.markdown("""
    This tool is designed to assist scholars in historical research by extracting text from challenging documents. While it may not achieve full accuracy for all materials, it serves as a tailored research aid for navigating historical documents, particularly:
    """)
    st.markdown("""
    - **Historical newspapers** with complex layouts and aged text
    - **Handwritten documents** from various time periods
    - **Photos of archival materials** that may be difficult to read
    """)

    st.markdown("### Features")
    st.markdown("""
    - **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
    - **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
    - **Editable Results**: Review and edit extracted text directly in the interface
    - **Structured Content Analysis**: Automatic organization of document content
    - **Multi-language Support**: Process documents in various languages
    - **PDF Processing**: Handle multi-page historical documents
    """)

    st.markdown("### How to Use")
    st.markdown("""
    1. Upload a document (PDF or image)
    2. Select the document type and adjust preprocessing options if needed
    3. Add custom processing instructions for specialized documents
    4. Process the document
    5. Review, edit, and download the results
    """)

    st.markdown("### Technologies")
    st.markdown("""
    - OCR processing using Mistral AI's advanced document understanding capabilities
    - Image preprocessing with OpenCV
    - PDF handling with pdf2image
    - Web interface with Streamlit
    """)

    # Version information
    st.markdown("**Version:** 1.0.0")