# Spaces: Running
import base64
import html
import io
import json
import os
from datetime import datetime
from pathlib import Path

import streamlit as st

from constants import (
    DOCUMENT_TYPES,
    DOCUMENT_LAYOUTS,
    CUSTOM_PROMPT_TEMPLATES,
    LAYOUT_PROMPT_ADDITIONS,
    DEFAULT_PDF_DPI,
    MIN_PDF_DPI,
    MAX_PDF_DPI,
    DEFAULT_MAX_PAGES,
    PERFORMANCE_MODES,
    PREPROCESSING_DOC_TYPES,
    ROTATION_OPTIONS
)
from utils import get_base64_from_image, extract_subject_tags
class ProgressReporter:
    """Drive a progress bar and a one-line status message inside a placeholder.

    Intended flow: ``ProgressReporter(placeholder).setup()``, then ``update()``
    as work advances, then ``complete()`` once at the end.
    """

    def __init__(self, placeholder):
        """Store the placeholder; the widgets themselves are created by ``setup``.

        Args:
            placeholder: Object whose ``container()`` result is usable as a
                context manager (presumably an ``st.empty()`` slot - confirm
                against the caller).
        """
        self.placeholder = placeholder
        self.progress_bar = None
        self.status_text = None

    def setup(self):
        """Create the progress bar and status slot inside the placeholder.

        Returns:
            ``self``, so the call can be chained onto the constructor.
        """
        with self.placeholder.container():
            self.progress_bar = st.progress(0)
            self.status_text = st.empty()
        return self

    def update(self, percent, status_text):
        """Show *percent* (0-100) on the bar and *status_text* below it.

        Values outside 0-100 are clamped instead of being forwarded: the bar
        is fed a 0.0-1.0 fraction, and an out-of-range fraction would make the
        progress widget raise in the middle of processing.

        Does nothing for a widget that has not been created yet.
        """
        if self.progress_bar is not None:
            fraction = min(1.0, max(0.0, percent / 100))
            self.progress_bar.progress(fraction)
        if self.status_text is not None:
            self.status_text.text(status_text)

    def complete(self, success=True):
        """Show the final state briefly, then clear both widgets.

        Args:
            success: When true the bar is filled and "Processing complete!" is
                shown; otherwise only "Processing failed." is shown.
        """
        if success:
            if self.progress_bar is not None:
                self.progress_bar.progress(100)
            if self.status_text is not None:
                self.status_text.text("Processing complete!")
        elif self.status_text is not None:
            self.status_text.text("Processing failed.")

        # Keep the final state visible for a moment before removing it.
        import time
        time.sleep(0.8)

        if self.progress_bar is not None:
            self.progress_bar.empty()
        if self.status_text is not None:
            self.status_text.empty()
def create_sidebar_options():
    """Render the "OCR Settings" sidebar and return the selected options.

    Returns:
        dict with the keys:
            ``use_vision``: value of the "Use Vision Model" toggle.
            ``perf_mode``: always ``"Quality"`` (no widget is offered for it).
            ``pdf_dpi``, ``max_pages``, ``pdf_rotation``: the "PDF Options"
                widget values.
            ``custom_prompt``: the (possibly user-edited) processing
                instructions; ``""`` when the first document type is selected.
            ``preprocessing_options``: dict with ``document_type`` (always
                ``"standard"``), ``grayscale``, ``denoise``, ``contrast`` and
                ``rotation``.
    """
    with st.sidebar:
        st.title("OCR Settings")

        with st.container():
            # Model selection
            st.subheader("Model Selection")
            use_vision = st.toggle(
                "Use Vision Model",
                value=True,
                help="Use vision model for better understanding of document structure",
            )

            # Document type and layout
            st.subheader("Document Type")
            doc_type = st.selectbox(
                "Document Type",
                DOCUMENT_TYPES,
                help="Select the type of document you're processing for better results",
            )
            doc_layout = st.selectbox(
                "Document Layout",
                DOCUMENT_LAYOUTS,
                help="Select the layout of your document",
            )

            # Custom prompt: only offered when a concrete document type is
            # chosen (the first entry is treated as "auto-detect").
            custom_prompt = ""
            if doc_type != DOCUMENT_TYPES[0]:
                prompt_template = CUSTOM_PROMPT_TEMPLATES.get(doc_type, "")

                # Append layout guidance unless the first ("standard") layout
                # is selected.
                if doc_layout != DOCUMENT_LAYOUTS[0]:
                    layout_addition = LAYOUT_PROMPT_ADDITIONS.get(doc_layout, "")
                    if layout_addition:
                        prompt_template += " " + layout_addition

                # The heading is the widget's own label rather than a separate
                # markdown line followed by an empty label: empty labels are
                # discouraged by Streamlit for accessibility reasons.
                custom_prompt = st.text_area(
                    "**Custom Processing Instructions**",
                    value=prompt_template,
                    help="Customize the instructions for processing this document",
                    height=100,
                )

            # Image preprocessing options in an expandable section
            with st.expander("Image Preprocessing"):
                grayscale = st.checkbox(
                    "Convert to Grayscale",
                    value=False,
                    help="Convert color images to grayscale for better OCR",
                )
                denoise = st.checkbox(
                    "Denoise Image",
                    value=False,
                    help="Remove noise from the image",
                )
                contrast = st.slider(
                    "Contrast Adjustment",
                    min_value=-50,
                    max_value=50,
                    value=0,
                    step=10,
                    help="Adjust image contrast",
                )
                rotation = st.slider(
                    "Rotation",
                    min_value=-45,
                    max_value=45,
                    value=0,
                    step=5,
                    help="Rotate image if needed",
                )

            preprocessing_options = {
                "document_type": "standard",  # fixed; no separate widget is offered
                "grayscale": grayscale,
                "denoise": denoise,
                "contrast": contrast,
                "rotation": rotation,
            }

            # PDF-specific options in an expandable section
            with st.expander("PDF Options"):
                pdf_dpi = st.slider(
                    "PDF Resolution (DPI)",
                    min_value=MIN_PDF_DPI,
                    max_value=MAX_PDF_DPI,
                    value=DEFAULT_PDF_DPI,
                    step=25,
                    help="Higher DPI gives better quality but slower processing",
                )
                max_pages = st.number_input(
                    "Maximum Pages to Process",
                    min_value=1,
                    max_value=20,
                    value=DEFAULT_MAX_PAGES,
                    help="Limit the number of pages to process (for multi-page PDFs)",
                )
                pdf_rotation = st.radio(
                    "PDF Rotation",
                    ROTATION_OPTIONS,
                    horizontal=True,
                    format_func=lambda x: f"{x}°",
                    help="Rotate PDF pages if needed",
                )

    return {
        "use_vision": use_vision,
        "perf_mode": "Quality",  # fixed; no performance-mode widget is offered
        "pdf_dpi": pdf_dpi,
        "max_pages": max_pages,
        "pdf_rotation": pdf_rotation,
        "custom_prompt": custom_prompt,
        "preprocessing_options": preprocessing_options,
    }
def create_file_uploader():
    """Render the page header and introduction, then the upload widget.

    Returns:
        Whatever ``st.file_uploader`` returns for the "Upload a document"
        widget (PDF, PNG, JPG and JPEG files are accepted).
    """
    # Header: favicon (embedded as base64) next to the app title.
    module_dir = os.path.dirname(__file__)
    favicon_base64 = get_base64_from_image(os.path.join(module_dir, "static/favicon.png"))
    header_markup = (
        '<div style="display: flex; align-items: center; gap: 10px;">'
        f'<img src="data:image/png;base64,{favicon_base64}" width="36" height="36" alt="Scroll Icon"/>'
        ' <div><h2 style="margin: 0; padding: 10px 0 0 0;">Historical Document OCR</h2></div></div>'
    )
    credit_markup = "<p style='font-size: 0.8em; color: #666; text-align: right;'>Made possible by Mistral AI</p>"

    # Project framing shown under the header.
    intro_text = """
This tool is designed to assist scholars in historical research by extracting text from challenging documents.
While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
historical documents, particularly:
- **Historical newspapers** with complex layouts and aged text
- **Handwritten documents** from various time periods
- **Photos of archival materials** that may be difficult to read
Upload a document to get started, or explore the example documents.
"""

    st.markdown(header_markup, unsafe_allow_html=True)
    st.markdown(credit_markup, unsafe_allow_html=True)
    st.markdown(intro_text)

    return st.file_uploader(
        "Upload a document",
        type=["pdf", "png", "jpg", "jpeg"],
        help="Upload a PDF or image file for OCR processing"
    )
# Keyword groups for subject-tag badges, checked in order; the first group
# containing a keyword found in the tag decides the badge color.
_SUBJECT_TAG_COLORS = (
    (("century", "pre-", "era", "historical"), "#1565c0"),  # Blue for time periods
    (("language", "english", "french", "german", "latin"), "#00695c"),  # Teal for languages
    (("letter", "newspaper", "book", "form", "document", "recipe"), "#6a1b9a"),  # Purple for document types
    (("travel", "military", "science", "medicine", "education", "art", "literature"), "#2e7d32"),  # Green for subject domains
    (("preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated"), "#e65100"),  # Orange for preprocessing tags
)
_SUBJECT_TAG_DEFAULT_COLOR = "#546e7a"


def _subject_tag_color(topic):
    """Return the badge background color for *topic* (substring keyword match)."""
    lowered = topic.lower()
    for keywords, color in _SUBJECT_TAG_COLORS:
        if any(term in lowered for term in keywords):
            return color
    return _SUBJECT_TAG_DEFAULT_COLOR


def _subject_tags_html(topics):
    """Return one self-contained HTML fragment holding a badge per topic.

    Topic text is HTML-escaped because it is derived from processed documents
    and is rendered with ``unsafe_allow_html=True``.
    """
    badges = []
    for topic in topics:
        label = str(topic)
        badges.append(
            f'<span style="background-color: {_subject_tag_color(label)}; color: white; padding: 3px 8px; '
            f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">'
            f'{html.escape(label)}</span>'
        )
    return (
        '<div style="display: flex; flex-wrap: wrap; gap: 5px; margin-top: 5px;">'
        + "".join(badges)
        + '</div>'
    )


def _result_file_stem(result):
    """Return the result's file name without directory or final extension.

    Falls back to ``'document'`` when no usable name is stored. Only the last
    suffix is removed, so ``'census.1851.pdf'`` yields ``'census.1851'``.
    """
    stem = Path(str(result.get('file_name') or 'document')).stem
    return stem or 'document'


def _ocr_plain_text(ocr_contents, default=""):
    """Return the text stored under 'raw_text' (preferred) or 'content'.

    Returns *default* when *ocr_contents* is not a dict or has neither key.
    A stored ``None`` becomes ``""`` and other non-string values are
    stringified, so the result can be passed to text widgets.
    """
    if not isinstance(ocr_contents, dict):
        return default
    for key in ('raw_text', 'content'):
        if key in ocr_contents:
            value = ocr_contents[key]
            return "" if value is None else str(value)
    return default


def _render_structured_sections(ocr_contents):
    """Write each non-empty section of *ocr_contents* under its own heading."""
    for section, content in ocr_contents.items():
        # Error and raw-text sections are not part of the structured view.
        if not content or section in ('error', 'raw_text', 'partial_text'):
            continue
        st.markdown(f"#### {section.replace('_', ' ').title()}")
        if isinstance(content, str):
            st.write(content)
        elif isinstance(content, list):
            for item in content:
                st.write(f"- {item}")
        elif isinstance(content, dict):
            for k, v in content.items():
                st.write(f"**{k}:** {v}")


def display_results(result, container, custom_prompt=""):
    """Display OCR results in the provided container.

    Args:
        result: dict describing one processed document. Keys read here (all
            optional): ``detected_document_type``, ``languages``,
            ``processing_time``, ``limited_pages`` (with ``processed`` and
            ``total``), ``topics``, ``ocr_contents``, ``has_images``,
            ``file_name``.
        container: Object usable as a context manager; all output is rendered
            inside it.
        custom_prompt: Processing instructions to show in an expander; nothing
            is shown when empty.
    """
    with container:
        # Document metadata
        st.subheader("Document Metadata")
        meta_col1, meta_col2 = st.columns(2)

        with meta_col1:
            if 'detected_document_type' in result:
                st.write(f"**Document Type:** {result['detected_document_type']}")
            languages = [str(lang) for lang in result.get('languages') or [] if lang is not None]
            if languages:
                st.write(f"**Languages:** {', '.join(languages)}")

        with meta_col2:
            elapsed = result.get('processing_time')
            # Only format values that support the numeric format spec.
            if isinstance(elapsed, (int, float)):
                st.write(f"**Processing Time:** {elapsed:.1f}s")
            if 'limited_pages' in result:
                st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")

        # Subject tags. All badges go out in one markdown call: each call is
        # rendered as its own element, so a wrapper <div> opened in one call
        # cannot enclose badges emitted by later calls.
        topics = result.get('topics')
        if topics:
            st.write("**Subject Tags:**")
            st.markdown(_subject_tags_html(topics), unsafe_allow_html=True)

        # OCR content
        st.subheader("OCR Content")
        if 'ocr_contents' in result:
            ocr_contents = result['ocr_contents']
            has_images = result.get('has_images', False)

            tab_labels = ["Structured View", "Raw Text"]
            if has_images:
                tab_labels.append("With Images")
            tabs = st.tabs(tab_labels)

            with tabs[0]:
                if isinstance(ocr_contents, dict):
                    _render_structured_sections(ocr_contents)

            with tabs[1]:
                edited_text = st.text_area("Edit Raw Text", _ocr_plain_text(ocr_contents), height=400)

                # NOTE: this button only shows a confirmation message; nothing
                # in this function writes to the clipboard.
                if st.button("Copy to Clipboard"):
                    st.success("Text copied to clipboard! (You can paste it elsewhere)")

                st.download_button(
                    label="Download Edited Text",
                    data=edited_text,
                    file_name=f"{_result_file_stem(result)}_edited.txt",
                    mime="text/plain"
                )

            if has_images:
                with tabs[2]:
                    # The helper reports missing page data itself, so the tab
                    # is never left blank.
                    display_document_with_images(result)

        # Custom prompt, if one was used
        if custom_prompt:
            with st.expander("Custom Processing Instructions"):
                st.write(custom_prompt)

        # Downloads
        st.subheader("Download Results")
        download_col1, download_col2 = st.columns(2)

        with download_col1:
            # Best effort: a result that cannot be serialized is reported in
            # the UI instead of aborting the rest of the page.
            try:
                json_str = json.dumps(result, indent=2)
                st.download_button(
                    label="Download JSON",
                    data=json_str,
                    file_name=f"{_result_file_stem(result)}_ocr.json",
                    mime="application/json"
                )
            except Exception as e:
                st.error(f"Error creating JSON download: {str(e)}")

        with download_col2:
            try:
                if 'ocr_contents' in result:
                    text_content = _ocr_plain_text(
                        result['ocr_contents'],
                        default=str(result['ocr_contents']),
                    )
                else:
                    text_content = "No text content available."
                st.download_button(
                    label="Download Text",
                    data=text_content,
                    file_name=f"{_result_file_stem(result)}_ocr.txt",
                    mime="text/plain"
                )
            except Exception as e:
                st.error(f"Error creating text download: {str(e)}")
def display_document_with_images(result):
    """Show every page of *result* as an image next to its editable text.

    Reads ``result['pages_data']``; each entry may carry ``image_data``
    (base64-encoded image bytes) and ``text``. When ``pages_data`` is absent
    an info message is shown and nothing else is rendered.
    """
    if 'pages_data' not in result:
        st.info("No image data available.")
        return

    for idx, page in enumerate(result['pages_data']):
        page_no = idx + 1
        st.markdown(f"### Page {page_no}")

        # Image on the left, text on the right, equal widths.
        image_col, body_col = st.columns([1, 1])

        with image_col:
            if 'image_data' not in page:
                st.info("No image available for this page.")
            else:
                try:
                    decoded = base64.b64decode(page['image_data'])
                    st.image(io.BytesIO(decoded), use_container_width=True)
                except Exception as err:
                    st.error(f"Error displaying image: {str(err)}")

        with body_col:
            if 'text' not in page:
                st.info("No text available for this page.")
            else:
                # Widget keys use the zero-based page index.
                st.text_area(f"Page {page_no} Text", page['text'], height=300, key=f"page_text_{idx}")
                if st.button(f"Copy Page {page_no} Text", key=f"copy_btn_{idx}"):
                    st.success(f"Page {page_no} text copied to clipboard!")
_HISTORY_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png")

# Keyword groups for history-view tag badges, checked in order; first match wins.
_HISTORY_TAG_COLORS = (
    (("century", "pre-", "era", "historical"), "#1565c0"),  # Blue for time periods
    (("language", "english", "french", "german", "latin"), "#00695c"),  # Teal for languages
    (("letter", "newspaper", "book", "form", "document", "recipe"), "#6a1b9a"),  # Purple for document types
    (("travel", "military", "science", "medicine", "education", "art", "literature"), "#2e7d32"),  # Green for subject domains
    (("preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated"), "#e65100"),  # Orange for preprocessing tags
)

# Simplified stylesheet used when ``ui.layout.load_css`` cannot be imported.
_HISTORY_FALLBACK_CSS = """
<style>
.previous-results-container {
margin-top: 20px;
}
.result-card {
background-color: #f8f9fa;
border-radius: 8px;
padding: 15px;
margin-bottom: 15px;
border: 1px solid #e0e0e0;
}
.result-header {
display: flex;
justify-content: space-between;
margin-bottom: 10px;
}
.result-filename {
font-weight: bold;
font-size: 16px;
}
.result-date {
color: #666;
font-size: 14px;
}
.result-metadata {
margin-top: 10px;
font-size: 14px;
}
.result-tag {
margin-bottom: 5px;
color: #555;
}
.result-action-button {
margin-top: 10px;
text-align: right;
}
.selected-result-container {
margin-top: 30px;
padding: 20px;
background-color: #f0f2f6;
border-radius: 8px;
}
.selected-result-title {
font-size: 18px;
font-weight: bold;
}
</style>
"""

_HISTORY_EMPTY_STATE_HTML = """
<div class="previous-results-container" style="text-align: center; padding: 40px 20px; background-color: #f0f2f6; border-radius: 8px;">
<div style="font-size: 48px; margin-bottom: 20px;">📄</div>
<h3 style="margin-bottom: 10px; font-weight: 600;">No Previous Results</h3>
<p style="font-size: 16px;">Process a document to see your results history saved here.</p>
</div>
"""


def _history_lower_name(result):
    """Return the stored file name lowercased ('' when missing or None)."""
    return str(result.get("file_name") or "").lower()


def _history_badges_html(topics):
    """Return one self-contained, HTML-escaped badge strip for *topics*."""
    badges = []
    for topic in topics:
        label = str(topic)
        lowered = label.lower()
        badge_color = "#546e7a"  # Default color
        for keywords, color in _HISTORY_TAG_COLORS:
            if any(term in lowered for term in keywords):
                badge_color = color
                break
        badges.append(
            f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
            f'border-radius: 12px; font-size: 0.85em; display: inline-block; margin-bottom: 5px;">'
            f'{html.escape(label)}</span>'
        )
    return (
        '<div style="display: flex; flex-wrap: wrap; gap: 5px; margin-top: 5px;">'
        + "".join(badges)
        + '</div>'
    )


def _history_card_html(result, position):
    """Return the complete HTML card summarizing one stored result.

    Args:
        result: Stored result dict; reads ``file_name``,
            ``descriptive_file_name``, ``timestamp``, ``languages``, ``topics``.
        position: Zero-based position in the displayed list, used for the
            "Document N" fallback name.
    """
    file_name = result.get("file_name") or f"Document {position + 1}"
    lowered = str(file_name).lower()
    if lowered.endswith(".pdf"):
        icon = "📄"
    elif lowered.endswith((".jpg", ".jpeg", ".png", ".gif")):
        icon = "🖼️"
    else:
        icon = "📝"

    title = html.escape(str(result.get('descriptive_file_name', file_name)))
    stamp = html.escape(str(result.get('timestamp', 'Unknown')))

    # Language lists may contain None entries; joining them would raise.
    languages = [str(lang) for lang in result.get('languages') or [] if lang is not None]
    language_text = html.escape(', '.join(languages or ['Unknown']))

    topics = [str(topic) for topic in result.get('topics') or []]
    topic_text = html.escape(', '.join(topics[:5] or ['Unknown']))
    if len(topics) > 5:
        topic_text += f" + {len(topics) - 5} more"

    return f"""
<div class="result-card">
<div class="result-header">
<div class="result-filename">{icon} {title}</div>
<div class="result-date">{stamp}</div>
</div>
<div class="result-metadata">
<div class="result-tag">Languages: {language_text}</div>
<div class="result-tag">Topics: {topic_text}</div>
</div>
</div>
"""


def _history_zip_name(pdf_count, img_count, timestamp):
    """Return a ZIP file name describing how many PDFs/images it bundles."""
    if pdf_count > 0 and img_count > 0:
        return f"historical_ocr_mixed_{pdf_count}pdf_{img_count}img_{timestamp}.zip"
    if pdf_count > 0:
        return f"historical_ocr_pdf_documents_{pdf_count}_{timestamp}.zip"
    if img_count > 0:
        return f"historical_ocr_images_{img_count}_{timestamp}.zip"
    return f"historical_ocr_results_{timestamp}.zip"


def _show_selected_history_result(selected_result):
    """Render header, metadata and content tabs for the selected stored result."""
    title = html.escape(str(selected_result.get('file_name', 'Unknown')))
    stamp = html.escape(str(selected_result.get('timestamp', '')))
    st.markdown(
        f"""
<div class="selected-result-container">
<div class="result-header" style="margin-bottom: 20px;">
<div class="selected-result-title">Selected Document: {title}</div>
<div class="result-date">{stamp}</div>
</div>
</div>
""",
        unsafe_allow_html=True,
    )

    # Metadata
    meta_col1, meta_col2 = st.columns(2)
    with meta_col1:
        languages = [str(lang) for lang in selected_result.get('languages') or [] if lang is not None]
        if languages:
            st.write(f"**Languages:** {', '.join(languages)}")
        topics = selected_result.get('topics')
        if topics:
            st.markdown("**Subject Tags:**")
            st.markdown(_history_badges_html(topics), unsafe_allow_html=True)
    with meta_col2:
        if 'limited_pages' in selected_result:
            st.info(f"Processed {selected_result['limited_pages']['processed']} of {selected_result['limited_pages']['total']} pages")
        proc_time = selected_result.get('processing_time')
        # Only format values that support the numeric format spec.
        if isinstance(proc_time, (int, float)):
            st.write(f"**Processing Time:** {proc_time:.1f}s")

    # Content tabs
    has_images = selected_result.get('has_images', False)
    tab_labels = ["Structured View", "Raw Text"]
    if has_images:
        tab_labels.append("With Images")
    tabs = st.tabs(tab_labels)

    ocr_contents = selected_result.get('ocr_contents')

    with tabs[0]:
        if isinstance(ocr_contents, dict):
            for section, content in ocr_contents.items():
                # Error and raw-text sections are not part of the structured view.
                if not content or section in ('error', 'raw_text', 'partial_text'):
                    continue
                st.markdown(f"#### {section.replace('_', ' ').title()}")
                if isinstance(content, str):
                    st.write(content)
                elif isinstance(content, list):
                    for item in content:
                        st.write(f"- {item}")
                elif isinstance(content, dict):
                    for k, v in content.items():
                        st.write(f"**{k}:** {v}")

    with tabs[1]:
        # Prefer 'raw_text', then 'content'; anything else leaves the box empty.
        raw_text = ""
        if isinstance(ocr_contents, dict):
            for key in ('raw_text', 'content'):
                if key in ocr_contents:
                    value = ocr_contents[key]
                    raw_text = "" if value is None else str(value)
                    break

        edited_text = st.text_area("Edit Raw Text", raw_text, height=400, key="selected_raw_text")

        # NOTE: this button only shows a confirmation message; nothing in this
        # function writes to the clipboard.
        if st.button("Copy to Clipboard", key="selected_copy_btn"):
            st.success("Text copied to clipboard! (You can paste it elsewhere)")

        # Path.stem drops only the final extension, so dotted names survive.
        stem = Path(str(selected_result.get('file_name') or 'document')).stem or 'document'
        st.download_button(
            label="Download Edited Text",
            data=edited_text,
            file_name=f"{stem}_edited.txt",
            mime="text/plain",
            key="selected_download_btn"
        )

    if has_images:
        with tabs[2]:
            display_document_with_images(selected_result)

    if st.button("Close Selected Document", key="close_selected"):
        del st.session_state.selected_previous_result
        st.rerun()


def display_previous_results():
    """Display previous results tab content.

    Reads ``st.session_state.previous_results`` (list of result dicts) and
    ``st.session_state.selected_previous_result``; writes the latter when a
    "View Document" button is pressed and deletes it when the detail view is
    closed. Both actions trigger ``st.rerun()``.
    """
    st.markdown('<h2>Previous Results</h2>', unsafe_allow_html=True)

    # Prefer the project stylesheet; fall back to a simplified inline one.
    try:
        from ui.layout import load_css
        load_css()
    except ImportError:
        st.markdown(_HISTORY_FALLBACK_CSS, unsafe_allow_html=True)

    previous_results = st.session_state.get("previous_results") or []
    if not previous_results:
        st.markdown(_HISTORY_EMPTY_STATE_HTML, unsafe_allow_html=True)
        return

    # Every markdown call below emits complete HTML: each call is rendered as
    # its own element, so a tag left open in one call cannot wrap widgets or
    # markup produced by later calls.
    st.markdown(
        '<div class="previous-results-container">'
        f'<h3>{len(previous_results)} Previous Results</h3></div>',
        unsafe_allow_html=True,
    )

    # Classify once; the filter options, ZIP name and filtering all reuse this.
    pdf_results = [r for r in previous_results if _history_lower_name(r).endswith(".pdf")]
    image_results = [r for r in previous_results if _history_lower_name(r).endswith(_HISTORY_IMAGE_SUFFIXES)]

    filter_col, download_col = st.columns([2, 1])

    with filter_col:
        filter_options = ["All Types"]
        if pdf_results:
            filter_options.append("PDF Documents")
        if image_results:
            filter_options.append("Images")
        selected_filter = st.selectbox("Filter by Type:", filter_options)

    with download_col:
        # Best effort: a failure to build the archive is reported in the UI
        # and must not prevent the result list from rendering.
        try:
            from ocr_utils import create_results_zip_in_memory

            zip_data = create_results_zip_in_memory(previous_results)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            st.download_button(
                label="Download All Results",
                data=zip_data,
                file_name=_history_zip_name(len(pdf_results), len(image_results), timestamp),
                mime="application/zip",
                help="Download all previous results as a ZIP file containing HTML and JSON files"
            )
        except Exception as e:
            st.error(f"Error creating download: {str(e)}")
            st.info("Try with fewer results or individual downloads")

    if selected_filter == "PDF Documents":
        filtered_results = pdf_results
    elif selected_filter == "Images":
        filtered_results = image_results
    else:
        filtered_results = previous_results

    if not filtered_results:
        st.markdown("""
<div style="text-align: center; padding: 20px; background-color: #f9f9f9; border-radius: 5px; margin: 20px 0;">
<p>No results match the selected filter.</p>
</div>
""", unsafe_allow_html=True)

    for i, result in enumerate(filtered_results):
        st.markdown(_history_card_html(result, i), unsafe_allow_html=True)
        if st.button("View Document", key=f"view_{i}"):
            # Store the result object itself: ``i`` indexes the *filtered*
            # list, so looking it up in the unfiltered list would open the
            # wrong document whenever a filter is active.
            st.session_state.selected_previous_result = result
            st.rerun()

    selected_result = st.session_state.get('selected_previous_result')
    if selected_result:
        _show_selected_history_result(selected_result)
def display_about_tab():
    """Render the static "About" tab: heading, description and version line."""
    about_text = """
**Historical OCR** is a specialized tool for extracting text from historical documents, manuscripts, and printed materials.
### Purpose
This tool is designed to assist scholars in historical research by extracting text from challenging documents.
While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating
historical documents, particularly:
- **Historical newspapers** with complex layouts and aged text
- **Handwritten documents** from various time periods
- **Photos of archival materials** that may be difficult to read
### Features
- **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
- **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
- **Editable Results**: Review and edit extracted text directly in the interface
- **Structured Content Analysis**: Automatic organization of document content
- **Multi-language Support**: Process documents in various languages
- **PDF Processing**: Handle multi-page historical documents
### How to Use
1. Upload a document (PDF or image)
2. Select the document type and adjust preprocessing options if needed
3. Add custom processing instructions for specialized documents
4. Process the document
5. Review, edit, and download the results
### Technologies
- OCR processing using Mistral AI's advanced document understanding capabilities
- Image preprocessing with OpenCV
- PDF handling with pdf2image
- Web interface with Streamlit
"""
    version_line = "**Version:** 1.0.0"

    # Heading first, then the description, then the version footer.
    st.markdown('<h2>About Historical OCR</h2>', unsafe_allow_html=True)
    st.markdown(about_text)
    st.markdown(version_line)