""" | |
UI utilities for OCR results display. | |
""" | |
import os | |
import streamlit as st | |
import json | |
import base64 | |
import io | |
from datetime import datetime | |
from utils.text_utils import format_ocr_text | |
from utils.content_utils import classify_document_content, format_structured_data | |


def display_results(result, container, custom_prompt=""):
    """Display OCR results in the provided container."""
    with container:
        # Add a heading for the document metadata
        st.markdown("### Document Metadata")

        # Filter out large data structures from the metadata display
        meta = {k: v for k, v in result.items()
                if k not in ['pages_data', 'illustrations', 'ocr_contents', 'raw_response_data']}

        # Create a compact section for the primary metadata
        meta_html = '<div style="display: flex; flex-wrap: wrap; gap: 0.3rem; margin-bottom: 0.3rem;">'

        # Document type
        if 'detected_document_type' in meta:
            meta_html += f'<div><strong>Type:</strong> {meta["detected_document_type"]}</div>'

        # Page information
        if 'limited_pages' in meta:
            meta_html += f'<div><strong>Pages:</strong> {meta["limited_pages"]["processed"]}/{meta["limited_pages"]["total"]}</div>'

        meta_html += '</div>'
        st.markdown(meta_html, unsafe_allow_html=True)

        # Processing time gets its own line so the metadata fields keep a stable order
        if 'processing_time' in meta:
            time_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
            time_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Time:</div>'
            time_html += f'<div>{meta["processing_time"]:.1f}s</div>'
            time_html += '</div>'
            st.markdown(time_html, unsafe_allow_html=True)

        # Languages go on their own line; Subject Tags follow further below
        if 'languages' in result and result['languages']:
            languages = [lang for lang in result['languages'] if lang is not None]
            if languages:
                # Create a dedicated line for Languages
                lang_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
                lang_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Language:</div>'
                # Add a tag per language, skipping empty names
                for lang in languages:
                    clean_lang = str(lang).strip()
                    if clean_lang:
                        lang_html += f'<span class="subject-tag tag-language">{clean_lang}</span>'
                lang_html += '</div>'
                st.markdown(lang_html, unsafe_allow_html=True)

        # Prepare the download files
        try:
            # Build a descriptive base filename
            from utils.general_utils import create_descriptive_filename
            original_file = result.get('file_name', 'document')
            base_name = create_descriptive_filename(original_file, result, "")
            base_name = os.path.splitext(base_name)[0]

            # 1. JSON download, with base64 data truncated for readability
            from utils.image_utils import truncate_base64_in_result
            truncated_result = truncate_base64_in_result(result)
            json_str = json.dumps(truncated_result, indent=2)
            json_filename = f"{base_name}.json"
            json_b64 = base64.b64encode(json_str.encode()).decode()

            # 2. ZIP archive containing all result files
            from utils.image_utils import create_results_zip_in_memory
            zip_data = create_results_zip_in_memory(result)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            zip_filename = f"{base_name}_{timestamp}.zip"
            zip_b64 = base64.b64encode(zip_data).decode()

            # Add a download line styled like the metadata lines
            download_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
            download_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Download:</div>'
            # Download links in order of importance, matching the zip file contents
            download_html += f'<a href="data:application/json;base64,{json_b64}" download="{json_filename}" class="subject-tag tag-download">JSON</a>'
            # Zip download link (packages everything together)
            download_html += f'<a href="data:application/zip;base64,{zip_b64}" download="{zip_filename}" class="subject-tag tag-download">Zip Archive</a>'
            download_html += '</div>'
            st.markdown(download_html, unsafe_allow_html=True)
        except Exception:
            # Fail silently so a download problem doesn't disrupt the UI
            pass
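
        # Note (alternative approach): Streamlit's built-in st.download_button
        # could replace the hand-rolled data-URI links above, at the cost of a
        # button-styled widget instead of an inline tag, e.g.:
        #   st.download_button("JSON", json_str, file_name=json_filename,
        #                      mime="application/json")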

        # Create a separate line for Time if we have time-related tags
        if 'topics' in result and result['topics']:
            time_tags = [topic for topic in result['topics']
                         if any(term in topic.lower() for term in ["century", "pre-", "era"])]
            if time_tags:
                time_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
                time_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Time:</div>'
                for tag in time_tags:
                    time_html += f'<span class="subject-tag tag-time-period">{tag}</span>'
                time_html += '</div>'
                st.markdown(time_html, unsafe_allow_html=True)

        # Then display the remaining subject tags, if any
        if 'topics' in result and result['topics']:
            # Filter out the time-related tags that were already displayed
            subject_tags = [topic for topic in result['topics']
                            if not any(term in topic.lower() for term in ["century", "pre-", "era"])]
            if subject_tags:
                # Create a separate line for Subject Tags
                tags_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
                tags_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Subject Tags:</div>'
                tags_html += '<div style="display: flex; flex-wrap: wrap; gap: 2px; align-items: center;">'
                # Generate a badge for each remaining tag
                for topic in subject_tags:
                    # Determine the tag category class, starting from the default
                    tag_class = "subject-tag"
                    if any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
                        tag_class += " tag-language"  # Languages
                    elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
                        # Document types (also catches e.g. "Historical Document Analysis")
                        tag_class += " tag-document-type"
                    elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
                        tag_class += " tag-subject"  # Subject domains
                    # Add each tag as an inline span
                    tags_html += f'<span class="{tag_class}">{topic}</span>'
                # Close the containers and render the subject tags section
                tags_html += '</div></div>'
                st.markdown(tags_html, unsafe_allow_html=True)

        # Check if we have OCR content
        if 'ocr_contents' in result:
            # Create a single view instead of tabs
            content_tab1 = st.container()

            # Check for images in the result, for use further below
            has_images = result.get('has_images', False)
            has_image_data = ('pages_data' in result and
                              any(page.get('images', []) for page in result.get('pages_data', [])))
            has_raw_images = ('raw_response_data' in result and 'pages' in result['raw_response_data'] and
                              any('images' in page for page in result['raw_response_data']['pages']
                                  if isinstance(page, dict)))
            # Offer the Images tab whenever any of the three sources carries images
            has_any_images = has_images or has_image_data or has_raw_images

            # Display structured content
            with content_tab1:
                # Display structured content with markdown formatting
                # (the CSS for the tag classes lives in the main layout.py file)
                if isinstance(result['ocr_contents'], dict):
                    # Collect all available images from the result
                    available_images = []
                    if has_images and 'pages_data' in result:
                        for page_idx, page in enumerate(result['pages_data']):
                            if 'images' in page and len(page['images']) > 0:
                                for img_idx, img in enumerate(page['images']):
                                    if 'image_base64' in img:
                                        available_images.append({
                                            'source': 'pages_data',
                                            'page': page_idx,
                                            'index': img_idx,
                                            'data': img['image_base64']
                                        })

                    # Get images from the raw response as well
                    if 'raw_response_data' in result:
                        raw_data = result['raw_response_data']
                        if isinstance(raw_data, dict) and 'pages' in raw_data:
                            for page_idx, page in enumerate(raw_data['pages']):
                                if isinstance(page, dict) and 'images' in page:
                                    for img_idx, img in enumerate(page['images']):
                                        if isinstance(img, dict) and 'base64' in img:
                                            available_images.append({
                                                'source': 'raw_response',
                                                'page': page_idx,
                                                'index': img_idx,
                                                'data': img['base64']
                                            })

                    # Collect the images to show in the Images tab
                    images_to_display = []
                    for img_idx, img in enumerate(available_images):
                        if 'data' in img:
                            images_to_display.append({
                                'data': img['data'],
                                'id': img.get('id', f"img_{img_idx}"),
                                'index': img_idx
                            })

                    # Organize sections in a logical order - prioritize main_text
                    section_order = ["title", "author", "date", "summary", "main_text", "content", "transcript", "metadata"]
                    ordered_sections = []

                    # Add known sections first, in the preferred order
                    for section_name in section_order:
                        if section_name in result['ocr_contents'] and result['ocr_contents'][section_name]:
                            ordered_sections.append(section_name)

                    # Add any remaining sections
                    for section in result['ocr_contents'].keys():
                        if (section not in ordered_sections and
                                section not in ['error', 'partial_text'] and
                                result['ocr_contents'][section]):
                            ordered_sections.append(section)

                    # If raw_text is the only content available, add it last
                    if ('raw_text' in result['ocr_contents'] and
                            result['ocr_contents']['raw_text'] and
                            len(ordered_sections) == 0):
                        ordered_sections.append('raw_text')

                    # Add minimal spacing before the OCR results
                    st.markdown("<div style='margin: 8px 0 4px 0;'></div>", unsafe_allow_html=True)

                    # Create tabs for the different views
                    if has_any_images:
                        doc_tab, json_tab, img_tab = st.tabs(["Document Content", "Raw JSON", "Images"])
                    else:
                        doc_tab, json_tab = st.tabs(["Document Content", "Raw JSON"])
                        img_tab = None

                    # Document Content tab: simple, clean formatting that matches the markdown export files
                    with doc_tab:
                        # Create a single unified content section
                        st.markdown("## Text Content")

                        # Present content directly in the format used by the markdown export files
                        if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict):
                            # Collect all content fields that should be displayed
                            # (left_page, right_page, etc.)
                            content_fields = {}
                            for field, content in result['ocr_contents'].items():
                                # Skip fields that shouldn't be displayed
                                if field in ['error', 'partial_text'] or not content:
                                    continue
                                # Clean the content if it's a string
                                if isinstance(content, str) and content.strip():
                                    content_fields[field] = content.strip()
                                # Handle dictionary or list content
                                elif isinstance(content, (dict, list)):
                                    formatted_content = format_structured_data(content)
                                    if formatted_content:
                                        content_fields[field] = formatted_content

                            # Flatten nested dictionary structures into displayable fields
                            def flatten_content_fields(fields, parent_key=""):
                                flat_fields = {}
                                for field, content in fields.items():
                                    # Skip certain fields
                                    if field in ['error', 'partial_text'] or not content:
                                        continue
                                    # Handle string content
                                    if isinstance(content, str) and content.strip():
                                        key = f"{parent_key}_{field}".strip("_")
                                        flat_fields[key] = content.strip()
                                    # Handle dictionary content
                                    elif isinstance(content, dict):
                                        # If the dictionary has a 'text' key, extract just that value
                                        if 'text' in content and isinstance(content['text'], str):
                                            key = f"{parent_key}_{field}".strip("_")
                                            flat_fields[key] = content['text'].strip()
                                        # Otherwise, recursively process the nested dictionary
                                        else:
                                            nested_fields = flatten_content_fields(content, f"{parent_key}_{field}")
                                            flat_fields.update(nested_fields)
                                    # Handle list content
                                    elif isinstance(content, list):
                                        formatted_content = format_structured_data(content)
                                        if formatted_content:
                                            key = f"{parent_key}_{field}".strip("_")
                                            flat_fields[key] = formatted_content
                                return flat_fields
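
                            # Illustrative example (hypothetical input, not from a real
                            # OCR response): nested dicts collapse into underscore-joined
                            # keys, and inner 'text' values are lifted out directly:
                            #   flatten_content_fields({
                            #       'left_page': {'text': ' Dear Sir, '},
                            #       'header': {'date': '12 May 1823'},
                            #   })
                            #   -> {'left_page': 'Dear Sir,', 'header_date': '12 May 1823'}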

                            # Flatten the full content structure
                            flat_content_fields = flatten_content_fields(result['ocr_contents'])

                            # Display the flattened content fields with proper formatting
                            for field, content in flat_content_fields.items():
                                # Skip any empty content
                                if not content or not content.strip():
                                    continue

                                # Format the field name as in the markdown export
                                field_display = field.replace('_', ' ')

                                # Keep the content as plain text rather than parsing it as
                                # JSON - historical text may contain curly braces that aren't JSON
                                if field == 'raw_text':
                                    # For raw_text, show only the content, without a field name
                                    st.markdown(content)
                                else:
                                    # For other fields, show the field name in bold, then the content
                                    st.markdown(f"**{field_display}:** {content}")

                                # Add spacing between fields
                                st.markdown("\n\n")

                    # Raw JSON tab: shows exactly the JSON that the JSON download button provides
                    with json_tab:
                        # Use the same truncated result that feeds the download link
                        from utils.image_utils import truncate_base64_in_result
                        truncated_result = truncate_base64_in_result(result)
                        # Streamlit's built-in JSON viewer handles pretty-printing and copying
                        st.json(truncated_result)

                    # Images tab - for viewing document images
                    if has_any_images and img_tab:
                        with img_tab:
                            # Display each available image
                            for i, img in enumerate(images_to_display):
                                st.image(img['data'], caption=f"Image {i+1}", use_container_width=True)

        # Display the custom prompt if one was provided
        if custom_prompt:
            with st.expander("Custom Processing Instructions"):
                st.write(custom_prompt)
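

# ---------------------------------------------------------------------------
# Usage sketch (hypothetical data, not a real OCR result): how display_results
# might be wired into a Streamlit page. The keys in _demo_result mirror the
# ones the function reads above; a real dict would come from the OCR pipeline.
# Run with: streamlit run <this file>
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    _demo_result = {
        'file_name': 'letter_1823.jpg',
        'detected_document_type': 'Letter',
        'processing_time': 4.2,
        'languages': ['English'],
        'topics': ['19th Century', 'Travel'],
        'ocr_contents': {
            'title': 'Letter from Boston',
            'main_text': 'Dear Sir, I write to you from Boston...',
        },
        'has_images': False,
    }
    display_results(_demo_result, st.container(), custom_prompt="")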