# historical-ocr / utils / ui_utils.py
# NOTE(review): removed non-Python residue from a Hugging Face file-page scrape.
# Last commit referenced there: 3030658
# "Fix metadata field ordering and tag classification issues"
"""
UI utilities for OCR results display.
"""
import os
import streamlit as st
import json
import base64
import io
from datetime import datetime
from utils.text_utils import format_ocr_text
from utils.content_utils import classify_document_content, format_structured_data
def display_results(result, container, custom_prompt=""):
"""Display OCR results in the provided container"""
with container:
# Add heading for document metadata
st.markdown("### Document Metadata")
# Filter out large data structures from metadata display
meta = {k: v for k, v in result.items()
if k not in ['pages_data', 'illustrations', 'ocr_contents', 'raw_response_data']}
# Create a compact metadata section for primary metadata
meta_html = '<div style="display: flex; flex-wrap: wrap; gap: 0.3rem; margin-bottom: 0.3rem;">'
# Document type
if 'detected_document_type' in meta:
meta_html += f'<div><strong>Type:</strong> {meta["detected_document_type"]}</div>'
# Page information
if 'limited_pages' in meta:
meta_html += f'<div><strong>Pages:</strong> {meta["limited_pages"]["processed"]}/{meta["limited_pages"]["total"]}</div>'
meta_html += '</div>'
st.markdown(meta_html, unsafe_allow_html=True)
# Processing time - separate section for proper ordering of all metadata fields
if 'processing_time' in meta:
time_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
time_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Time:</div>'
time_html += f'<div>{meta["processing_time"]:.1f}s</div>'
time_html += '</div>'
st.markdown(time_html, unsafe_allow_html=True)
# Language metadata on a separate line, Subject Tags below
# First show languages if available
if 'languages' in result and result['languages']:
languages = [lang for lang in result['languages'] if lang is not None]
if languages:
# Create a dedicated line for Languages
lang_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
lang_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Language:</div>'
# Add language tags
for lang in languages:
# Clean language name if needed
clean_lang = str(lang).strip()
if clean_lang: # Only add if not empty
lang_html += f'<span class="subject-tag tag-language">{clean_lang}</span>'
lang_html += '</div>'
st.markdown(lang_html, unsafe_allow_html=True)
# Prepare download files
try:
# Get base filename
from utils.general_utils import create_descriptive_filename
original_file = result.get('file_name', 'document')
base_name = create_descriptive_filename(original_file, result, "")
base_name = os.path.splitext(base_name)[0]
# 1. JSON download - with base64 data truncated for readability
from utils.image_utils import truncate_base64_in_result
truncated_result = truncate_base64_in_result(result)
json_str = json.dumps(truncated_result, indent=2)
json_filename = f"{base_name}.json"
json_b64 = base64.b64encode(json_str.encode()).decode()
# 2. Create ZIP with all files
from utils.image_utils import create_results_zip_in_memory
zip_data = create_results_zip_in_memory(result)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
zip_filename = f"{base_name}_{timestamp}.zip"
zip_b64 = base64.b64encode(zip_data).decode()
# Add download line with metadata styling
download_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
download_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Download:</div>'
# Download links in order of importance, matching the zip file contents
download_html += f'<a href="data:application/json;base64,{json_b64}" download="{json_filename}" class="subject-tag tag-download">JSON</a>'
# Zip download link (packages everything together)
download_html += f'<a href="data:application/zip;base64,{zip_b64}" download="{zip_filename}" class="subject-tag tag-download">Zip Archive</a>'
download_html += '</div>'
st.markdown(download_html, unsafe_allow_html=True)
except Exception as e:
# Silent fail for downloads - don't disrupt the UI
pass
# Create a separate line for Time if we have time-related tags
if 'topics' in result and result['topics']:
time_tags = [topic for topic in result['topics']
if any(term in topic.lower() for term in ["century", "pre-", "era"])]
if time_tags:
time_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
time_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Time:</div>'
for tag in time_tags:
time_html += f'<span class="subject-tag tag-time-period">{tag}</span>'
time_html += '</div>'
st.markdown(time_html, unsafe_allow_html=True)
# Then display remaining subject tags if available
if 'topics' in result and result['topics']:
# Filter out time-related tags which are already displayed
subject_tags = [topic for topic in result['topics']
if not any(term in topic.lower() for term in ["century", "pre-", "era"])]
if subject_tags:
# Create a separate line for Subject Tags
tags_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
tags_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Subject Tags:</div>'
tags_html += '<div style="display: flex; flex-wrap: wrap; gap: 2px; align-items: center;">'
# Generate a badge for each remaining tag
for topic in subject_tags:
# Determine tag category class
tag_class = "subject-tag" # Default class
# Add specialized class based on category
if any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
tag_class += " tag-language" # Languages
elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
tag_class += " tag-document-type" # Document types
elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
tag_class += " tag-subject" # Subject domains
elif "historical" in topic.lower() and "document" in topic.lower():
tag_class += " tag-document-type" # "Historical Document Analysis" should be a document type
# Add each tag as an inline span
tags_html += f'<span class="{tag_class}">{topic}</span>'
# Close the containers
tags_html += '</div></div>'
# Render the subject tags section
st.markdown(tags_html, unsafe_allow_html=True)
# Check if we have OCR content
if 'ocr_contents' in result:
# Create a single view instead of tabs
content_tab1 = st.container()
# Check for images in the result to use later
has_images = result.get('has_images', False)
has_image_data = ('pages_data' in result and any(page.get('images', []) for page in result.get('pages_data', [])))
has_raw_images = ('raw_response_data' in result and 'pages' in result['raw_response_data'] and
any('images' in page for page in result['raw_response_data']['pages']
if isinstance(page, dict)))
# Display structured content
with content_tab1:
# Display structured content with markdown formatting
if isinstance(result['ocr_contents'], dict):
# CSS is now handled in the main layout.py file
# Collect all available images from the result
available_images = []
if has_images and 'pages_data' in result:
for page_idx, page in enumerate(result['pages_data']):
if 'images' in page and len(page['images']) > 0:
for img_idx, img in enumerate(page['images']):
if 'image_base64' in img:
available_images.append({
'source': 'pages_data',
'page': page_idx,
'index': img_idx,
'data': img['image_base64']
})
# Get images from raw response as well
if 'raw_response_data' in result:
raw_data = result['raw_response_data']
if isinstance(raw_data, dict) and 'pages' in raw_data:
for page_idx, page in enumerate(raw_data['pages']):
if isinstance(page, dict) and 'images' in page:
for img_idx, img in enumerate(page['images']):
if isinstance(img, dict) and 'base64' in img:
available_images.append({
'source': 'raw_response',
'page': page_idx,
'index': img_idx,
'data': img['base64']
})
# Extract images for display at the top
images_to_display = []
# First, collect all available images
for img_idx, img in enumerate(available_images):
if 'data' in img:
images_to_display.append({
'data': img['data'],
'id': img.get('id', f"img_{img_idx}"),
'index': img_idx
})
# Image display now only happens in the Images tab
# Organize sections in a logical order - prioritize main_text
section_order = ["title", "author", "date", "summary", "main_text", "content", "transcript", "metadata"]
ordered_sections = []
# Add known sections first in preferred order
for section_name in section_order:
if section_name in result['ocr_contents'] and result['ocr_contents'][section_name]:
ordered_sections.append(section_name)
# Add any remaining sections
for section in result['ocr_contents'].keys():
if (section not in ordered_sections and
section not in ['error', 'partial_text'] and
result['ocr_contents'][section]):
ordered_sections.append(section)
# If only raw_text is available and no other content, add it last
if ('raw_text' in result['ocr_contents'] and
result['ocr_contents']['raw_text'] and
len(ordered_sections) == 0):
ordered_sections.append('raw_text')
# Add minimal spacing before OCR results
st.markdown("<div style='margin: 8px 0 4px 0;'></div>", unsafe_allow_html=True)
# Create tabs for different views
if has_images:
tabs = st.tabs(["Document Content", "Raw JSON", "Images"])
doc_tab, json_tab, img_tab = tabs
else:
tabs = st.tabs(["Document Content", "Raw JSON"])
doc_tab, json_tab = tabs
img_tab = None
# Document Content tab with simple, clean formatting that matches markdown export files
with doc_tab:
# Create a single unified content section
st.markdown("## Text Content")
# Present content directly in the format used in markdown export files
if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict):
# Get all content fields that should be displayed
content_fields = {}
# Add all available content fields (left_page, right_page, etc)
for field, content in result['ocr_contents'].items():
# Skip certain fields that shouldn't be displayed
if field in ['error', 'partial_text'] or not content:
continue
# Clean the content if it's a string
if isinstance(content, str) and content.strip():
content_fields[field] = content.strip()
# Handle dictionary or list content
elif isinstance(content, (dict, list)):
formatted_content = format_structured_data(content)
if formatted_content:
content_fields[field] = formatted_content
# Process nested dictionary structures
def flatten_content_fields(fields, parent_key=""):
flat_fields = {}
for field, content in fields.items():
# Skip certain fields
if field in ['error', 'partial_text'] or not content:
continue
# Handle string content
if isinstance(content, str) and content.strip():
key = f"{parent_key}_{field}".strip("_")
flat_fields[key] = content.strip()
# Handle dictionary content
elif isinstance(content, dict):
# If the dictionary has a 'text' key, extract just that value
if 'text' in content and isinstance(content['text'], str):
key = f"{parent_key}_{field}".strip("_")
flat_fields[key] = content['text'].strip()
# Otherwise, recursively process nested dictionaries
else:
nested_fields = flatten_content_fields(content, f"{parent_key}_{field}")
flat_fields.update(nested_fields)
# Handle list content
elif isinstance(content, list):
formatted_content = format_structured_data(content)
if formatted_content:
key = f"{parent_key}_{field}".strip("_")
flat_fields[key] = formatted_content
return flat_fields
# Flatten the content structure
flat_content_fields = flatten_content_fields(result['ocr_contents'])
# Display the flattened content fields with proper formatting
for field, content in flat_content_fields.items():
# Skip any empty content
if not content or not content.strip():
continue
# Format field name as in the markdown export
field_display = field.replace('_', ' ')
# Maintain content purity - don't parse text content as JSON
# Historical text may contain curly braces that aren't JSON
# For raw_text field, display only the content without the field name
if field == 'raw_text':
st.markdown(f"{content}")
else:
# For other fields, display the field name in bold followed by the content
st.markdown(f"**{field}:** {content}")
# Add spacing between fields
st.markdown("\n\n")
# Raw JSON tab - displays the exact same JSON that's downloaded via the JSON button
with json_tab:
# Use the same truncated JSON that's used in the download button
from utils.image_utils import truncate_base64_in_result
truncated_result = truncate_base64_in_result(result)
# Format the JSON prettily
json_str = json.dumps(truncated_result, indent=2)
# Display JSON with a copy button using Streamlit's built-in functionality
st.json(truncated_result)
# Images tab - for viewing document images
if has_images and img_tab:
with img_tab:
# Display each available image
for i, img in enumerate(images_to_display):
st.image(img['data'], caption=f"Image {i+1}", use_container_width=True)
# Display custom prompt if provided
if custom_prompt:
with st.expander("Custom Processing Instructions"):
st.write(custom_prompt)