# historical-ocr / utils / ui_utils.py
# NOTE(review): removed non-Python residue from a Hugging Face file-page scrape.
# Last commit referenced there: 3030658
# "Fix metadata field ordering and tag classification issues"
"""
UI utilities for OCR results display.
"""
import os
import streamlit as st
import json
import base64
import io
from datetime import datetime
from utils.text_utils import format_ocr_text
from utils.content_utils import classify_document_content, format_structured_data
def display_results(result, container, custom_prompt=""):
"""Display OCR results in the provided container"""
with container:
# Add heading for document metadata
st.markdown("### Document Metadata")
# Filter out large data structures from metadata display
meta = {k: v for k, v in result.items()
if k not in ['pages_data', 'illustrations', 'ocr_contents', 'raw_response_data']}
# Create a compact metadata section for primary metadata
meta_html = '<div style="display: flex; flex-wrap: wrap; gap: 0.3rem; margin-bottom: 0.3rem;">'
# Document type
if 'detected_document_type' in meta:
meta_html += f'<div><strong>Type:</strong> {meta["detected_document_type"]}</div>'
# Page information
if 'limited_pages' in meta:
meta_html += f'<div><strong>Pages:</strong> {meta["limited_pages"]["processed"]}/{meta["limited_pages"]["total"]}</div>'
meta_html += '</div>'
st.markdown(meta_html, unsafe_allow_html=True)
# Processing time - separate section for proper ordering of all metadata fields
if 'processing_time' in meta:
time_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
time_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Time:</div>'
time_html += f'<div>{meta["processing_time"]:.1f}s</div>'
time_html += '</div>'
st.markdown(time_html, unsafe_allow_html=True)
# Language metadata on a separate line, Subject Tags below
# First show languages if available
if 'languages' in result and result['languages']:
languages = [lang for lang in result['languages'] if lang is not None]
if languages:
# Create a dedicated line for Languages
lang_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
lang_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Language:</div>'
# Add language tags
for lang in languages:
# Clean language name if needed
clean_lang = str(lang).strip()
if clean_lang: # Only add if not empty
lang_html += f'<span class="subject-tag tag-language">{clean_lang}</span>'
lang_html += '</div>'
st.markdown(lang_html, unsafe_allow_html=True)
# Prepare download files
try:
# Get base filename
from utils.general_utils import create_descriptive_filename
original_file = result.get('file_name', 'document')
base_name = create_descriptive_filename(original_file, result, "")
base_name = os.path.splitext(base_name)[0]
# 1. JSON download - with base64 data truncated for readability
from utils.image_utils import truncate_base64_in_result
truncated_result = truncate_base64_in_result(result)
json_str = json.dumps(truncated_result, indent=2)
json_filename = f"{base_name}.json"
json_b64 = base64.b64encode(json_str.encode()).decode()
# 2. Create ZIP with all files
from utils.image_utils import create_results_zip_in_memory
zip_data = create_results_zip_in_memory(result)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
zip_filename = f"{base_name}_{timestamp}.zip"
zip_b64 = base64.b64encode(zip_data).decode()
# Add download line with metadata styling
download_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
download_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Download:</div>'
# Download links in order of importance, matching the zip file contents
download_html += f'<a href="data:application/json;base64,{json_b64}" download="{json_filename}" class="subject-tag tag-download">JSON</a>'
# Zip download link (packages everything together)
download_html += f'<a href="data:application/zip;base64,{zip_b64}" download="{zip_filename}" class="subject-tag tag-download">Zip Archive</a>'
download_html += '</div>'
st.markdown(download_html, unsafe_allow_html=True)
except Exception as e:
# Silent fail for downloads - don't disrupt the UI
pass
# Create a separate line for Time if we have time-related tags
if 'topics' in result and result['topics']:
time_tags = [topic for topic in result['topics']
if any(term in topic.lower() for term in ["century", "pre-", "era"])]
if time_tags:
time_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
time_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Time:</div>'
for tag in time_tags:
time_html += f'<span class="subject-tag tag-time-period">{tag}</span>'
time_html += '</div>'
st.markdown(time_html, unsafe_allow_html=True)
# Then display remaining subject tags if available
if 'topics' in result and result['topics']:
# Filter out time-related tags which are already displayed
subject_tags = [topic for topic in result['topics']
if not any(term in topic.lower() for term in ["century", "pre-", "era"])]
if subject_tags:
# Create a separate line for Subject Tags
tags_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
tags_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Subject Tags:</div>'
tags_html += '<div style="display: flex; flex-wrap: wrap; gap: 2px; align-items: center;">'
# Generate a badge for each remaining tag
for topic in subject_tags:
# Determine tag category class
tag_class = "subject-tag" # Default class
# Add specialized class based on category
if any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
tag_class += " tag-language" # Languages
elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
tag_class += " tag-document-type" # Document types
elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
tag_class += " tag-subject" # Subject domains
elif "historical" in topic.lower() and "document" in topic.lower():
tag_class += " tag-document-type" # "Historical Document Analysis" should be a document type
# Add each tag as an inline span
tags_html += f'<span class="{tag_class}">{topic}</span>'
# Close the containers
tags_html += '</div></div>'
# Render the subject tags section
st.markdown(tags_html, unsafe_allow_html=True)
# Check if we have OCR content
if 'ocr_contents' in result:
# Create a single view instead of tabs
content_tab1 = st.container()
# Check for images in the result to use later
has_images = result.get('has_images', False)
has_image_data = ('pages_data' in result and any(page.get('images', []) for page in result.get('pages_data', [])))
has_raw_images = ('raw_response_data' in result and 'pages' in result['raw_response_data'] and
any('images' in page for page in result['raw_response_data']['pages']
if isinstance(page, dict)))
# Display structured content
with content_tab1:
# Display structured content with markdown formatting
if isinstance(result['ocr_contents'], dict):
# CSS is now handled in the main layout.py file
# Collect all available images from the result
available_images = []
if has_images and 'pages_data' in result:
for page_idx, page in enumerate(result['pages_data']):
if 'images' in page and len(page['images']) > 0:
for img_idx, img in enumerate(page['images']):
if 'image_base64' in img:
available_images.append({
'source': 'pages_data',
'page': page_idx,
'index': img_idx,
'data': img['image_base64']
})
# Get images from raw response as well
if 'raw_response_data' in result:
raw_data = result['raw_response_data']
if isinstance(raw_data, dict) and 'pages' in raw_data:
for page_idx, page in enumerate(raw_data['pages']):
if isinstance(page, dict) and 'images' in page:
for img_idx, img in enumerate(page['images']):
if isinstance(img, dict) and 'base64' in img:
available_images.append({
'source': 'raw_response',
'page': page_idx,
'index': img_idx,
'data': img['base64']
})
# Extract images for display at the top
images_to_display = []
# First, collect all available images
for img_idx, img in enumerate(available_images):
if 'data' in img:
images_to_display.append({
'data': img['data'],
'id': img.get('id', f"img_{img_idx}"),
'index': img_idx
})
# Image display now only happens in the Images tab
# Organize sections in a logical order - prioritize main_text
section_order = ["title", "author", "date", "summary", "main_text", "content", "transcript", "metadata"]
ordered_sections = []
# Add known sections first in preferred order
for section_name in section_order:
if section_name in result['ocr_contents'] and result['ocr_contents'][section_name]:
ordered_sections.append(section_name)
# Add any remaining sections
for section in result['ocr_contents'].keys():
if (section not in ordered_sections and
section not in ['error', 'partial_text'] and
result['ocr_contents'][section]):
ordered_sections.append(section)
# If only raw_text is available and no other content, add it last
if ('raw_text' in result['ocr_contents'] and
result['ocr_contents']['raw_text'] and
len(ordered_sections) == 0):
ordered_sections.append('raw_text')
# Add minimal spacing before OCR results
st.markdown("<div style='margin: 8px 0 4px 0;'></div>", unsafe_allow_html=True)
# Create tabs for different views
if has_images:
tabs = st.tabs(["Document Content", "Raw JSON", "Images"])
doc_tab, json_tab, img_tab = tabs
else:
tabs = st.tabs(["Document Content", "Raw JSON"])
doc_tab, json_tab = tabs
img_tab = None
# Document Content tab with simple, clean formatting that matches markdown export files
with doc_tab:
# Create a single unified content section
st.markdown("## Text Content")
# Present content directly in the format used in markdown export files
if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict):
# Get all content fields that should be displayed
content_fields = {}
# Add all available content fields (left_page, right_page, etc)
for field, content in result['ocr_contents'].items():
# Skip certain fields that shouldn't be displayed
if field in ['error', 'partial_text'] or not content:
continue
# Clean the content if it's a string
if isinstance(content, str) and content.strip():
content_fields[field] = content.strip()
# Handle dictionary or list content
elif isinstance(content, (dict, list)):
formatted_content = format_structured_data(content)
if formatted_content:
content_fields[field] = formatted_content
# Process nested dictionary structures
def flatten_content_fields(fields, parent_key=""):
flat_fields = {}
for field, content in fields.items():
# Skip certain fields
if field in ['error', 'partial_text'] or not content:
continue
# Handle string content
if isinstance(content, str) and content.strip():
key = f"{parent_key}_{field}".strip("_")
flat_fields[key] = content.strip()
# Handle dictionary content
elif isinstance(content, dict):
# If the dictionary has a 'text' key, extract just that value
if 'text' in content and isinstance(content['text'], str):
key = f"{parent_key}_{field}".strip("_")
flat_fields[key] = content['text'].strip()
# Otherwise, recursively process nested dictionaries
else:
nested_fields = flatten_content_fields(content, f"{parent_key}_{field}")
flat_fields.update(nested_fields)
# Handle list content
elif isinstance(content, list):
formatted_content = format_structured_data(content)
if formatted_content:
key = f"{parent_key}_{field}".strip("_")
flat_fields[key] = formatted_content
return flat_fields
# Flatten the content structure
flat_content_fields = flatten_content_fields(result['ocr_contents'])
# Display the flattened content fields with proper formatting
for field, content in flat_content_fields.items():
# Skip any empty content
if not content or not content.strip():
continue
# Format field name as in the markdown export
field_display = field.replace('_', ' ')
# Maintain content purity - don't parse text content as JSON
# Historical text may contain curly braces that aren't JSON
# For raw_text field, display only the content without the field name
if field == 'raw_text':
st.markdown(f"{content}")
else:
# For other fields, display the field name in bold followed by the content
st.markdown(f"**{field}:** {content}")
# Add spacing between fields
st.markdown("\n\n")
# Raw JSON tab - displays the exact same JSON that's downloaded via the JSON button
with json_tab:
# Use the same truncated JSON that's used in the download button
from utils.image_utils import truncate_base64_in_result
truncated_result = truncate_base64_in_result(result)
# Format the JSON prettily
json_str = json.dumps(truncated_result, indent=2)
# Display JSON with a copy button using Streamlit's built-in functionality
st.json(truncated_result)
# Images tab - for viewing document images
if has_images and img_tab:
with img_tab:
# Display each available image
for i, img in enumerate(images_to_display):
st.image(img['data'], caption=f"Image {i+1}", use_container_width=True)
# Display custom prompt if provided
if custom_prompt:
with st.expander("Custom Processing Instructions"):
st.write(custom_prompt)