# historical-ocr / ui_components.py
import streamlit as st
import os
import io
import base64
import logging
import re
from datetime import datetime
from pathlib import Path
import json
# Define exports
__all__ = [
'ProgressReporter',
'create_sidebar_options',
'create_file_uploader',
'display_document_with_images',
'display_previous_results',
'display_about_tab',
'display_results' # Re-export from utils.ui_utils
]
from constants import (
DOCUMENT_TYPES,
DOCUMENT_LAYOUTS,
CUSTOM_PROMPT_TEMPLATES,
LAYOUT_PROMPT_ADDITIONS,
DEFAULT_PDF_DPI,
MIN_PDF_DPI,
MAX_PDF_DPI,
DEFAULT_MAX_PAGES,
PERFORMANCE_MODES,
PREPROCESSING_DOC_TYPES,
ROTATION_OPTIONS
)
from utils.text_utils import format_ocr_text, clean_raw_text, format_markdown_text # Import from text_utils
from utils.content_utils import (
classify_document_content,
extract_document_text,
extract_image_description
)
from utils.ui_utils import display_results
from preprocessing import preprocess_image
class ProgressReporter:
"""Class to handle progress reporting in the UI"""
def __init__(self, placeholder):
self.placeholder = placeholder
self.progress_bar = None
self.status_text = None
def setup(self):
"""Setup the progress components"""
with self.placeholder.container():
self.progress_bar = st.progress(0)
self.status_text = st.empty()
return self
def update(self, percent, status_text):
"""Update the progress bar and status text"""
if self.progress_bar is not None:
self.progress_bar.progress(percent / 100)
if self.status_text is not None:
self.status_text.text(status_text)
def complete(self, success=True):
"""Complete the progress reporting"""
if success:
if self.progress_bar is not None:
self.progress_bar.progress(100)
if self.status_text is not None:
self.status_text.text("Processing complete!")
else:
if self.status_text is not None:
self.status_text.text("Processing failed.")
# Clear the progress components after a delay
import time
time.sleep(0.8) # Short delay to show completion
if self.progress_bar is not None:
self.progress_bar.empty()
if self.status_text is not None:
self.status_text.empty()
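# Illustrative usage of ProgressReporter (a sketch, assuming a Streamlit placeholder
# such as st.empty() is available at the call site):
#
#   reporter = ProgressReporter(st.empty()).setup()
#   reporter.update(25, "Preprocessing image...")
#   reporter.update(75, "Running OCR...")
#   reporter.complete(success=True)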
def create_sidebar_options():
"""Create and return sidebar options"""
with st.sidebar:
st.markdown("## OCR Settings")
# Create a container for the sidebar options
with st.container():
# Default to using vision model (removed selection from UI)
use_vision = True
# Document type selection
doc_type = st.selectbox("Document Type", DOCUMENT_TYPES,
help="Select the type of document you're processing for better results")
# Document layout
doc_layout = st.selectbox("Document Layout", DOCUMENT_LAYOUTS,
help="Select the layout of your document")
# Initialize preprocessing variables with default values
grayscale = False
denoise = False
contrast = 0
rotation = 0
use_segmentation = False
# Custom prompt
custom_prompt = ""
# Get the template for the selected document type if not auto-detect
if doc_type != DOCUMENT_TYPES[0]:
prompt_template = CUSTOM_PROMPT_TEMPLATES.get(doc_type, "")
# Add layout information if not standard
if doc_layout != DOCUMENT_LAYOUTS[0]: # Not standard layout
layout_addition = LAYOUT_PROMPT_ADDITIONS.get(doc_layout, "")
if layout_addition:
prompt_template += " " + layout_addition
# Set the custom prompt
custom_prompt = prompt_template
# Allow user to edit the prompt (always visible)
custom_prompt = st.text_area("Custom Processing Instructions", value=custom_prompt,
help="Customize the instructions for processing this document",
height=80)
# Image preprocessing options (always visible)
st.markdown("### Image Preprocessing")
# Grayscale conversion
grayscale = st.checkbox("Convert to Grayscale",
value=True,
help="Convert color images to grayscale for better text recognition")
# Light denoising option
denoise = st.checkbox("Light Denoising",
value=True,
help="Apply gentle denoising to improve text clarity")
# Contrast adjustment
contrast = st.slider("Contrast Adjustment",
min_value=-20,
max_value=20,
value=5,
step=5,
help="Adjust image contrast (limited range)")
# Initialize rotation (keeping it set to 0)
rotation = 0
use_segmentation = False
# Create preprocessing options dictionary
# Map UI document types to preprocessing document types
doc_type_for_preprocessing = "standard"
if "Handwritten" in doc_type:
doc_type_for_preprocessing = "handwritten"
elif "Newspaper" in doc_type or "Magazine" in doc_type:
doc_type_for_preprocessing = "newspaper"
elif "Book" in doc_type or "Publication" in doc_type:
doc_type_for_preprocessing = "book" # Match the actual preprocessing type
preprocessing_options = {
"document_type": doc_type_for_preprocessing,
"grayscale": grayscale,
"denoise": denoise,
"contrast": contrast,
"rotation": rotation
}
# PDF-specific options
st.markdown("### PDF Options")
max_pages = st.number_input("Maximum Pages to Process",
min_value=1,
max_value=20,
value=DEFAULT_MAX_PAGES,
help="Limit the number of pages to process (for multi-page PDFs)")
# Set default values for removed options
pdf_dpi = DEFAULT_PDF_DPI
pdf_rotation = 0
# Create options dictionary
options = {
"use_vision": use_vision,
"perf_mode": "Quality", # Default to Quality, removed performance mode option
"pdf_dpi": pdf_dpi,
"max_pages": max_pages,
"pdf_rotation": pdf_rotation,
"custom_prompt": custom_prompt,
"preprocessing_options": preprocessing_options,
"use_segmentation": use_segmentation if 'use_segmentation' in locals() else False
}
return options
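# Illustrative consumption of the returned options dictionary (a sketch; the actual
# call site lives elsewhere in the app, and the preprocess_image() signature is an
# assumption, not confirmed here):
#
#   options = create_sidebar_options()
#   prep_opts = options["preprocessing_options"]  # e.g. {"grayscale": True, "denoise": True, ...}
#   # processed = preprocess_image(file_bytes, prep_opts)  # assumed signature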
def create_file_uploader():
"""Create and return a file uploader"""
# Add app description
st.markdown('<div style="display: flex; align-items: center; gap: 10px;"><div style="font-size: 32px;">📜</div><div><h2 style="margin: 0; padding: 10px 0 0 0;">Historical OCR</h2></div></div>', unsafe_allow_html=True)
st.markdown("<p style='font-size: 0.8em; color: #666; text-align: left;'>Made possible by Mistral AI</p>", unsafe_allow_html=True)
# Add project framing
st.markdown("""
This tool assists scholars in historical research by extracting text from challenging documents. While it may not achieve 100% accuracy, it helps navigate:
- **Historical newspapers** with complex layouts
- **Handwritten documents** from various periods
- **Photos of archival materials**
Upload a document to begin, or explore the examples.
""")
# Create file uploader with a more concise label
uploaded_file = st.file_uploader(
"Select file",
type=["pdf", "png", "jpg"],
help="Upload a PDF or image file for OCR processing"
)
return uploaded_file
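# Illustrative handling of the uploader's return value (a sketch):
#
#   uploaded = create_file_uploader()
#   if uploaded is not None:
#       file_bytes = uploaded.getvalue()   # raw bytes of the PDF/image for OCR
#       file_name = uploaded.name          # original filename for result metadata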
def display_document_with_images(result):
"""Display document with images"""
# Check for pages_data first
if 'pages_data' in result and result['pages_data']:
pages_data = result['pages_data']
# If pages_data not available, try to extract from raw_response_data
elif 'raw_response_data' in result and isinstance(result['raw_response_data'], dict) and 'pages' in result['raw_response_data']:
# Build pages_data from raw_response_data
pages_data = []
raw_pages = result['raw_response_data']['pages']
for page_idx, page in enumerate(raw_pages):
if not isinstance(page, dict):
continue
page_data = {
'page_number': page_idx + 1,
'markdown': page.get('markdown', ''),
'images': []
}
# Extract images if present
if 'images' in page and isinstance(page['images'], list):
for img_idx, img in enumerate(page['images']):
if isinstance(img, dict) and ('base64' in img or 'image_base64' in img):
img_base64 = img.get('image_base64', img.get('base64', ''))
if img_base64:
page_data['images'].append({
'id': img.get('id', f"img_{page_idx}_{img_idx}"),
'image_base64': img_base64
})
if page_data['markdown'] or page_data['images']:
pages_data.append(page_data)
else:
st.info("No image data available.")
return
# Display each page
for i, page_data in enumerate(pages_data):
st.markdown(f"### Page {i+1}")
# Display only the image (removed text column)
# Display the image - check multiple possible field names
image_displayed = False
# Try 'image_data' field first
if 'image_data' in page_data:
try:
# Convert base64 to image
image_data = base64.b64decode(page_data['image_data'])
st.image(io.BytesIO(image_data), use_container_width=True)
image_displayed = True
except Exception as e:
st.error(f"Error displaying image from image_data: {str(e)}")
# Try 'images' array if image_data didn't work
if not image_displayed and 'images' in page_data and len(page_data['images']) > 0:
for img in page_data['images']:
if 'image_base64' in img:
try:
st.image(img['image_base64'], use_container_width=True)
image_displayed = True
break
except Exception as e:
st.error(f"Error displaying image from images array: {str(e)}")
# Try alternative image source if still not displayed
if not image_displayed and 'raw_response_data' in result:
raw_data = result['raw_response_data']
if isinstance(raw_data, dict) and 'pages' in raw_data:
for raw_page in raw_data['pages']:
if isinstance(raw_page, dict) and 'images' in raw_page:
for img in raw_page['images']:
if isinstance(img, dict) and 'base64' in img:
st.image(img['base64'], use_container_width=True)
st.caption("Image from OCR response")
image_displayed = True
break
if image_displayed:
break
if not image_displayed:
st.info("No image available for this page.")
# Extract and display alt text if available
page_text = ""
if 'text' in page_data:
page_text = page_data['text']
elif 'markdown' in page_data:
page_text = page_data['markdown']
if page_text and page_text.startswith("![") and page_text.endswith(")"):
try:
alt_text = page_text[2:page_text.index(']')]
if alt_text and len(alt_text) > 5: # Only show if alt text is meaningful
st.caption(f"Image description: {alt_text}")
except Exception:
pass
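# Illustrative shape of the `result` dict this function expects (a sketch based on
# the keys read above; field availability varies by OCR response):
#
#   result = {
#       "pages_data": [
#           {"page_number": 1,
#            "markdown": "...extracted text...",
#            "images": [{"id": "img_0_0", "image_base64": "data:image/jpeg;base64,..."}]},
#       ],
#       "raw_response_data": {"pages": [...]},  # fallback source when pages_data is absent
#   }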
def display_previous_results():
"""Display previous results tab content in a simplified, structured view"""
# Use a simple header without the button column
st.header("Previous Results")
# Display previous results if available
if not st.session_state.previous_results:
st.markdown("""
<div style="text-align: center; padding: 30px 20px; background-color: #f8f9fa; border-radius: 6px; margin-top: 10px;">
<div style="font-size: 36px; margin-bottom: 15px;">📄</div>
<h3="margin-bottom: 16px; font-weight: 500;">No Previous Results</h3>
<p style="font-size: 14px; color: #666;">Process a document to see your results history.</p>
</div>
""", unsafe_allow_html=True)
else:
# Prepare zip download outside of the UI flow
try:
# Create download button for all results
from utils.image_utils import create_results_zip_in_memory
zip_data = create_results_zip_in_memory(st.session_state.previous_results)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Simplified filename
zip_filename = f"ocr_results_{timestamp}.zip"
# Encode the zip data for direct download link
zip_b64 = base64.b64encode(zip_data).decode()
# Add styled download tag in the metadata section
download_html = '<div style="display: flex; align-items: center; margin: 0.5rem 0; flex-wrap: wrap;">'
download_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Download:</div>'
download_html += f'<a href="data:application/zip;base64,{zip_b64}" download="{zip_filename}" class="subject-tag tag-download">All Results</a>'
download_html += '</div>'
st.markdown(download_html, unsafe_allow_html=True)
except Exception:
# Silent fail - no error message to keep UI clean
pass
# Create a cleaner, more minimal grid for results using Streamlit columns
# Calculate number of columns based on screen width - more responsive
num_columns = 2 # Two columns for most screens
# Create rows of result cards
for i in range(0, len(st.session_state.previous_results), num_columns):
# Create a row of columns
cols = st.columns(num_columns)
# Fill each column with a result card
for j in range(num_columns):
index = i + j
if index < len(st.session_state.previous_results):
result = st.session_state.previous_results[index]
# Get basic info for the card
file_name = result.get("file_name", f"Document {index+1}")
timestamp = result.get("timestamp", "")
# Determine file type icon
if file_name.lower().endswith(".pdf"):
icon = "📄"
elif any(file_name.lower().endswith(ext) for ext in [".jpg", ".jpeg", ".png", ".gif"]):
icon = "🖼️"
else:
icon = "📝"
# Display a simplified card in each column
with cols[j]:
# Use a container for better styling control
with st.container():
# Create visually cleaner card with less vertical space
st.markdown(f"""
<div style="padding: 10px; border: 1px solid #e0e0e0; border-radius: 6px; margin-bottom: 10px;">
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 5px;">
<div style="font-weight: 500; font-size: 14px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap;">{icon} {file_name}</div>
<div style="color: #666; font-size: 12px;">{timestamp.split()[0] if timestamp else ""}</div>
</div>
</div>
""", unsafe_allow_html=True)
# Add a simple button below each card
if st.button(f"View", key=f"view_{index}", help=f"View {file_name}"):
st.session_state.selected_previous_result = st.session_state.previous_results[index]
st.rerun()
# Display the selected result if available
if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
selected_result = st.session_state.selected_previous_result
# Draw a separator between results list and selected document
st.markdown("<hr style='margin: 20px 0 15px 0; border: none; height: 1px; background-color: #eee;'>", unsafe_allow_html=True)
# Create a cleaner header for the selected document
file_name = selected_result.get('file_name', 'Document')
st.subheader(f"{file_name}")
# Add a simple back button at the top
if st.button("← Back to Results", key="back_to_results"):
if 'selected_previous_result' in st.session_state:
del st.session_state.selected_previous_result
st.session_state.perform_reset = True
st.rerun()
# Simplified metadata display - just one line with essential info
meta_html = '<div style="display: flex; flex-wrap: wrap; gap: 12px; margin: 8px 0 15px 0; font-size: 14px; color: #666;">'
# Add timestamp
if 'timestamp' in selected_result:
meta_html += f'<div>{selected_result["timestamp"]}</div>'
# Add languages if available (simplified)
if 'languages' in selected_result and selected_result['languages']:
languages = [lang for lang in selected_result['languages'] if lang is not None]
if languages:
meta_html += f'<div>Language: {", ".join(languages)}</div>'
# Add page count if available (simplified)
if 'limited_pages' in selected_result:
meta_html += f'<div>Pages: {selected_result["limited_pages"]["processed"]}/{selected_result["limited_pages"]["total"]}</div>'
meta_html += '</div>'
st.markdown(meta_html, unsafe_allow_html=True)
# Simplified tabs - using the same format as main view
has_images = selected_result.get('has_images', False)
if has_images:
view_tabs = st.tabs(["Document Content", "Raw JSON", "Images"])
view_tab1, view_tab2, view_tab3 = view_tabs
else:
view_tabs = st.tabs(["Document Content", "Raw JSON"])
view_tab1, view_tab2 = view_tabs
view_tab3 = None
# First tab - Document Content (simplified structured view)
with view_tab1:
# Display content in a cleaner, more streamlined format
if 'ocr_contents' in selected_result and isinstance(selected_result['ocr_contents'], dict):
# Create a more focused list of important sections
priority_sections = ["title", "content", "transcript", "summary"]
displayed_sections = set()
# First display priority sections
for section in priority_sections:
if section in selected_result['ocr_contents'] and selected_result['ocr_contents'][section]:
content = selected_result['ocr_contents'][section]
if isinstance(content, str) and content.strip():
# Only add a subheader for meaningful section names, not raw_text
if section != "raw_text":
st.markdown(f"##### {section.replace('_', ' ').title()}")
# Format and display content
formatted_content = format_ocr_text(content, for_display=True)
st.markdown(formatted_content)
displayed_sections.add(section)
# Then display any remaining sections not already shown
for section, content in selected_result['ocr_contents'].items():
if (section not in displayed_sections and
section not in ['error', 'partial_text'] and
content):
st.markdown(f"##### {section.replace('_', ' ').title()}")
if isinstance(content, str):
st.markdown(format_ocr_text(content, for_display=True))
elif isinstance(content, list):
for item in content:
st.markdown(f"- {item}")
elif isinstance(content, dict):
for k, v in content.items():
st.markdown(f"**{k}:** {v}")
# Second tab - Raw JSON (simplified)
with view_tab2:
# Extract the relevant JSON data
json_data = {}
# Include important metadata
for field in ['file_name', 'timestamp', 'processing_time', 'languages', 'topics', 'subjects', 'detected_document_type', 'text']:
if field in selected_result:
json_data[field] = selected_result[field]
# Include OCR contents
if 'ocr_contents' in selected_result:
json_data['ocr_contents'] = selected_result['ocr_contents']
# Exclude large binary data like base64 images to keep JSON clean
if 'pages_data' in selected_result:
# Create simplified pages_data without large binary content
simplified_pages = []
for page in selected_result['pages_data']:
simplified_page = {
'page_number': page.get('page_number', 0),
'has_text': bool(page.get('markdown', '')),
'has_images': bool(page.get('images', [])),
'image_count': len(page.get('images', []))
}
simplified_pages.append(simplified_page)
json_data['pages_summary'] = simplified_pages
# Format the JSON prettily
json_str = json.dumps(json_data, indent=2)
# Display in a monospace font with syntax highlighting
st.code(json_str, language="json")
# Third tab - Images (simplified)
if has_images and view_tab3 is not None:
with view_tab3:
# Simplified image display
if 'pages_data' in selected_result:
for i, page_data in enumerate(selected_result['pages_data']):
# Display each page
if 'images' in page_data and len(page_data['images']) > 0:
for img in page_data['images']:
if 'image_base64' in img:
st.image(img['image_base64'], use_container_width=True)
# Get page text if available
page_text = ""
if 'markdown' in page_data:
page_text = page_data['markdown']
# Display text if available
if page_text:
with st.expander(f"Page {i+1} Text", expanded=False):
st.text(page_text)
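# Note: display_previous_results() relies on session state prepared by the main app:
# st.session_state.previous_results (a list of result dicts) must already exist, and
# selected_previous_result / perform_reset are set here as needed. A minimal guard at
# the call site might look like this (illustrative):
#
#   if "previous_results" not in st.session_state:
#       st.session_state.previous_results = []
#   display_previous_results()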
def display_about_tab():
"""Display learn more tab content"""
st.header("Learn More")
# Add app description
st.markdown("""
**Historical OCR** is a tailored academic tool for extracting text from historical documents, manuscripts, and printed materials.
""")
# Purpose section with consistent formatting
st.markdown("### Purpose")
st.markdown("""
This tool is designed to assist scholars in historical research by extracting text from challenging documents.
While it may not achieve full accuracy for all materials, it serves as a tailored research aid for navigating
historical documents, particularly:
""")
st.markdown("""
- **Historical newspapers** with complex layouts and aged text
- **Handwritten documents** from various time periods
- **Photos of archival materials** that may be difficult to read
""")
# Features section with consistent formatting
st.markdown("### Features")
st.markdown("""
- **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
- **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
- **Editable Results**: Review and edit extracted text directly in the interface
- **Structured Content Analysis**: Automatic organization of document content
- **Multi-language Support**: Process documents in various languages
- **PDF Processing**: Handle multi-page historical documents
""")
# How to Use section with consistent formatting
st.markdown("### How to Use")
st.markdown("""
1. Upload a document (PDF or image)
2. Select the document type and adjust preprocessing options if needed
3. Add custom processing instructions for specialized documents
4. Process the document
5. Review, edit, and download the results
""")
# Technologies section with consistent formatting
st.markdown("### Technologies")
st.markdown("""
- OCR processing using Mistral AI's advanced document understanding capabilities
- Image preprocessing with OpenCV
- PDF handling with pdf2image
- Web interface with Streamlit
""")
# Add version information
st.markdown("**Version:** 1.0.0")