"""
UI utilities for OCR results display.
"""
import os
import json
import base64
import io
from datetime import datetime

import streamlit as st

from utils.text_utils import format_ocr_text
from utils.content_utils import classify_document_content, format_structured_data


def display_results(result, container, custom_prompt=""):
    """Display OCR results in the provided container."""
    with container:
        # Add heading for document metadata
        st.markdown("### Document Metadata")

        # Filter out large data structures from the metadata display
        meta = {k: v for k, v in result.items()
                if k not in ['pages_data', 'illustrations', 'ocr_contents', 'raw_response_data']}

        # Create a compact metadata section for primary metadata.
        # NOTE: the inline HTML in this function is a reconstruction; the class
        # names are illustrative and assume matching CSS rules in layout.py.
        meta_html = '<div class="metadata-line">'

        # Document type
        if 'detected_document_type' in meta:
            meta_html += (f'<span class="metadata-label">Type:</span> '
                          f'<span class="metadata-value">{meta["detected_document_type"]}</span>')

        # Page information
        if 'limited_pages' in meta:
            meta_html += (f'<span class="metadata-label">Pages:</span> '
                          f'<span class="metadata-value">'
                          f'{meta["limited_pages"]["processed"]}/{meta["limited_pages"]["total"]}</span>')

        meta_html += '</div>'
        st.markdown(meta_html, unsafe_allow_html=True)
        # Processing time - separate section for proper ordering of all metadata fields
        if 'processing_time' in meta:
            time_html = '<div class="metadata-line">'
            time_html += '<span class="metadata-label">Time:</span>'
            time_html += f'<span class="metadata-value">{meta["processing_time"]:.1f}s</span>'
            time_html += '</div>'
            st.markdown(time_html, unsafe_allow_html=True)
        # Language metadata on a separate line, Subject Tags below.
        # First show languages if available
        if 'languages' in result and result['languages']:
            languages = [lang for lang in result['languages'] if lang is not None]
            if languages:
                # Create a dedicated line for Languages
                lang_html = '<div class="metadata-line">'
                lang_html += '<span class="metadata-label">Language:</span>'
                # Add language tags
                for lang in languages:
                    # Clean the language name if needed
                    clean_lang = str(lang).strip()
                    if clean_lang:  # Only add if not empty
                        lang_html += f'<span class="language-tag">{clean_lang}</span>'
                lang_html += '</div>'
                st.markdown(lang_html, unsafe_allow_html=True)
        # Prepare download files
        try:
            # Get base filename
            from utils.general_utils import create_descriptive_filename
            original_file = result.get('file_name', 'document')
            base_name = create_descriptive_filename(original_file, result, "")
            base_name = os.path.splitext(base_name)[0]

            # 1. JSON download - with base64 data truncated for readability
            from utils.image_utils import truncate_base64_in_result
            truncated_result = truncate_base64_in_result(result)
            json_str = json.dumps(truncated_result, indent=2)
            json_filename = f"{base_name}.json"
            json_b64 = base64.b64encode(json_str.encode()).decode()

            # 2. Create a ZIP with all files
            from utils.image_utils import create_results_zip_in_memory
            zip_data = create_results_zip_in_memory(result)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            zip_filename = f"{base_name}_{timestamp}.zip"
            zip_b64 = base64.b64encode(zip_data).decode()

            # Add a download line with metadata styling
            # (markup reconstructed; data-URI links assumed from the base64 payloads above)
            download_html = '<div class="metadata-line">'
            download_html += '<span class="metadata-label">Download:</span>'
            # Download links in order of importance, matching the zip file contents
            download_html += (f'<a href="data:application/json;base64,{json_b64}" '
                              f'download="{json_filename}" class="download-link">JSON</a>')
            # Zip download link (packages everything together)
            download_html += (f'<a href="data:application/zip;base64,{zip_b64}" '
                              f'download="{zip_filename}" class="download-link">Zip Archive</a>')
            download_html += '</div>'
            st.markdown(download_html, unsafe_allow_html=True)
        except Exception:
            # Silent fail for downloads - don't disrupt the UI
            pass
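        # The download links above embed their payloads as data URIs, e.g.
        # (illustrative, truncated):
        #   <a href="data:application/zip;base64,UEsDBBQA..." download="doc.zip">Zip Archive</a>
        # so the browser can save the files without any server-side temp files.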
        # Create a separate line for Time if we have time-related tags
        if 'topics' in result and result['topics']:
            time_tags = [topic for topic in result['topics']
                         if any(term in topic.lower() for term in ["century", "pre-", "era"])]
            if time_tags:
                time_html = '<div class="metadata-line">'
                time_html += '<span class="metadata-label">Time:</span>'
                for tag in time_tags:
                    time_html += f'<span class="time-tag">{tag}</span>'
                time_html += '</div>'
                st.markdown(time_html, unsafe_allow_html=True)
        # Then display the remaining subject tags if available
        if 'topics' in result and result['topics']:
            # Filter out time-related tags, which are already displayed
            subject_tags = [topic for topic in result['topics']
                            if not any(term in topic.lower() for term in ["century", "pre-", "era"])]
            if subject_tags:
                # Create a separate line for Subject Tags
                tags_html = '<div class="metadata-line">'
                tags_html += '<span class="metadata-label">Subject Tags:</span>'
                tags_html += '<div class="tags-container">'
                # Generate a badge for each remaining tag
                for topic in subject_tags:
                    # Determine the tag category class
                    tag_class = "subject-tag"  # Default class
                    # Add a specialized class based on category
                    if any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
                        tag_class += " tag-language"  # Languages
                    elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
                        tag_class += " tag-document-type"  # Document types
                    elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
                        tag_class += " tag-subject"  # Subject domains
                    elif "historical" in topic.lower() and "document" in topic.lower():
                        tag_class += " tag-document-type"  # "Historical Document Analysis" counts as a document type
                    # Add each tag as an inline span
                    tags_html += f'<span class="{tag_class}">{topic}</span>'
                # Close the containers
                tags_html += '</div></div>'
                # Render the subject tags section
                st.markdown(tags_html, unsafe_allow_html=True)
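        # Note the branch order above: a topic such as "French Newspaper" matches
        # the language terms first and is styled "subject-tag tag-language", so
        # the more specific term lists must come before the broader ones.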
        # Check if we have OCR content
        if 'ocr_contents' in result:
            # Create a single view instead of tabs
            content_tab1 = st.container()

            # Check for images in the result to use later
            has_images = result.get('has_images', False)
            has_image_data = ('pages_data' in result and
                              any(page.get('images', []) for page in result.get('pages_data', [])))
            has_raw_images = ('raw_response_data' in result and 'pages' in result['raw_response_data'] and
                              any('images' in page for page in result['raw_response_data']['pages']
                                  if isinstance(page, dict)))

            # Display structured content
            with content_tab1:
                # Display structured content with markdown formatting
                if isinstance(result['ocr_contents'], dict):
                    # CSS is now handled in the main layout.py file

                    # Collect all available images from the result
                    available_images = []
                    if has_images and 'pages_data' in result:
                        for page_idx, page in enumerate(result['pages_data']):
                            if 'images' in page and len(page['images']) > 0:
                                for img_idx, img in enumerate(page['images']):
                                    if 'image_base64' in img:
                                        available_images.append({
                                            'source': 'pages_data',
                                            'page': page_idx,
                                            'index': img_idx,
                                            'data': img['image_base64']
                                        })

                    # Get images from the raw response as well
                    if 'raw_response_data' in result:
                        raw_data = result['raw_response_data']
                        if isinstance(raw_data, dict) and 'pages' in raw_data:
                            for page_idx, page in enumerate(raw_data['pages']):
                                if isinstance(page, dict) and 'images' in page:
                                    for img_idx, img in enumerate(page['images']):
                                        if isinstance(img, dict) and 'base64' in img:
                                            available_images.append({
                                                'source': 'raw_response',
                                                'page': page_idx,
                                                'index': img_idx,
                                                'data': img['base64']
                                            })
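                    # The two sources use different payload keys, e.g. (illustrative shapes):
                    #   pages_data:        {'images': [{'image_base64': 'data:image/jpeg;base64,...'}]}
                    #   raw_response_data: {'pages': [{'images': [{'base64': 'data:image/jpeg;base64,...'}]}]}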
                    # Extract images for display at the top
                    images_to_display = []
                    # First, collect all available images
                    for img_idx, img in enumerate(available_images):
                        if 'data' in img:
                            images_to_display.append({
                                'data': img['data'],
                                'id': img.get('id', f"img_{img_idx}"),
                                'index': img_idx
                            })
                    # Image display now only happens in the Images tab

                    # Organize sections in a logical order - prioritize main_text
                    section_order = ["title", "author", "date", "summary", "main_text", "content", "transcript", "metadata"]
                    ordered_sections = []

                    # Add known sections first, in preferred order
                    for section_name in section_order:
                        if section_name in result['ocr_contents'] and result['ocr_contents'][section_name]:
                            ordered_sections.append(section_name)

                    # Add any remaining sections
                    for section in result['ocr_contents'].keys():
                        if (section not in ordered_sections and
                                section not in ['error', 'partial_text'] and
                                result['ocr_contents'][section]):
                            ordered_sections.append(section)

                    # If only raw_text is available and no other content, add it last
                    if ('raw_text' in result['ocr_contents'] and
                            result['ocr_contents']['raw_text'] and
                            len(ordered_sections) == 0):
                        ordered_sections.append('raw_text')

                    # Add minimal spacing before the OCR results
                    st.markdown("", unsafe_allow_html=True)

                    # Create tabs for different views
                    if has_images:
                        doc_tab, json_tab, img_tab = st.tabs(["Document Content", "Raw JSON", "Images"])
                    else:
                        doc_tab, json_tab = st.tabs(["Document Content", "Raw JSON"])
                        img_tab = None
                    # Document Content tab with simple, clean formatting that matches the markdown export files
                    with doc_tab:
                        # Create a single unified content section
                        st.markdown("## Text Content")

                        # Present content directly in the format used in the markdown export files
                        if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict):
                            # Get all content fields that should be displayed
                            content_fields = {}

                            # Add all available content fields (left_page, right_page, etc.)
                            for field, content in result['ocr_contents'].items():
                                # Skip certain fields that shouldn't be displayed
                                if field in ['error', 'partial_text'] or not content:
                                    continue
                                # Clean the content if it's a string
                                if isinstance(content, str) and content.strip():
                                    content_fields[field] = content.strip()
                                # Handle dictionary or list content
                                elif isinstance(content, (dict, list)):
                                    formatted_content = format_structured_data(content)
                                    if formatted_content:
                                        content_fields[field] = formatted_content
                            # Process nested dictionary structures
                            def flatten_content_fields(fields, parent_key=""):
                                flat_fields = {}
                                for field, content in fields.items():
                                    # Skip certain fields
                                    if field in ['error', 'partial_text'] or not content:
                                        continue
                                    # Handle string content
                                    if isinstance(content, str) and content.strip():
                                        key = f"{parent_key}_{field}".strip("_")
                                        flat_fields[key] = content.strip()
                                    # Handle dictionary content
                                    elif isinstance(content, dict):
                                        # If the dictionary has a 'text' key, extract just that value
                                        if 'text' in content and isinstance(content['text'], str):
                                            key = f"{parent_key}_{field}".strip("_")
                                            flat_fields[key] = content['text'].strip()
                                        # Otherwise, recursively process nested dictionaries
                                        else:
                                            nested_fields = flatten_content_fields(content, f"{parent_key}_{field}")
                                            flat_fields.update(nested_fields)
                                    # Handle list content
                                    elif isinstance(content, list):
                                        formatted_content = format_structured_data(content)
                                        if formatted_content:
                                            key = f"{parent_key}_{field}".strip("_")
                                            flat_fields[key] = formatted_content
                                return flat_fields

                            # Flatten the content structure
                            flat_content_fields = flatten_content_fields(result['ocr_contents'])
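                            # For example (illustrative input, not a real OCR response):
                            #   {'body': {'text': 'Dear Sir...'}, 'pages': {'left': 'a', 'right': 'b'}}
                            # flattens to:
                            #   {'body': 'Dear Sir...', 'pages_left': 'a', 'pages_right': 'b'}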
                            # Display the flattened content fields with proper formatting
                            for field, content in flat_content_fields.items():
                                # Skip any empty content
                                if not content or not content.strip():
                                    continue

                                # Format the field name as in the markdown export
                                field_display = field.replace('_', ' ')

                                # Maintain content purity - don't parse text content as JSON;
                                # historical text may contain curly braces that aren't JSON.
                                if field == 'raw_text':
                                    # For raw_text, display only the content without the field name
                                    st.markdown(content)
                                else:
                                    # For other fields, display the field name in bold followed by the content
                                    st.markdown(f"**{field_display}:** {content}")

                                # Add spacing between fields
                                st.markdown("\n\n")
                    # Raw JSON tab - displays the exact same JSON that's downloaded via the JSON button
                    with json_tab:
                        # Use the same truncated JSON that's used in the download button
                        from utils.image_utils import truncate_base64_in_result
                        truncated_result = truncate_base64_in_result(result)
                        # Display the JSON with Streamlit's built-in viewer, which includes a copy button
                        st.json(truncated_result)
                    # Images tab - for viewing document images
                    if has_images and img_tab is not None:
                        with img_tab:
                            # Display each available image
                            for i, img in enumerate(images_to_display):
                                st.image(img['data'], caption=f"Image {i+1}", use_container_width=True)

        # Display the custom prompt if provided
        if custom_prompt:
            with st.expander("Custom Processing Instructions"):
                st.write(custom_prompt)
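

# Example usage - a minimal sketch; the import path and the exact shape of
# `result` below are illustrative assumptions, not the real pipeline output:
#
#     import streamlit as st
#     from utils.ui_utils import display_results  # hypothetical module path
#
#     result = {
#         'file_name': 'letter.jpg',
#         'detected_document_type': 'letter',
#         'processing_time': 4.2,
#         'languages': ['English'],
#         'topics': ['19th Century', 'Travel'],
#         'ocr_contents': {'title': 'A Letter Home', 'raw_text': 'Dear Mother, ...'},
#     }
#     display_results(result, st.container(), custom_prompt="Transcribe faithfully.")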