"""
Streamlit UI utilities for displaying OCR results: document metadata, subject tags, download links, structured text content, raw JSON, and extracted images.
"""
import os
import streamlit as st
import json
import base64
import io
from datetime import datetime
from utils.text_utils import format_ocr_text
from utils.content_utils import classify_document_content, format_structured_data
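# Example usage (illustrative sketch only; the import path and the exact values
# in `result` are assumptions - the dict is produced elsewhere in the app):
#
#     import streamlit as st
#     from utils.ui_utils import display_results  # module path assumed
#
#     result = {
#         "file_name": "letter.jpg",
#         "detected_document_type": "letter",
#         "processing_time": 4.2,
#         "languages": ["English"],
#         "topics": ["Letter", "19th Century"],
#         "ocr_contents": {"title": "A letter home", "main_text": "Dear Sir, ..."},
#     }
#     display_results(result, st.container(), custom_prompt="")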
def display_results(result, container, custom_prompt=""):
"""Display OCR results (metadata, tags, downloads, text content, and images) in the provided Streamlit container."""
with container:
# Add heading for document metadata
st.markdown("### Document Metadata")
# Filter out large data structures from metadata display
meta = {k: v for k, v in result.items()
if k not in ['pages_data', 'illustrations', 'ocr_contents', 'raw_response_data']}
# Create a compact metadata section for primary metadata
meta_html = '<div style="display: flex; flex-wrap: wrap; gap: 0.3rem; margin-bottom: 0.3rem;">'
# Document type
if 'detected_document_type' in meta:
meta_html += f'<div><strong>Type:</strong> {meta["detected_document_type"]}</div>'
# Page information
if 'limited_pages' in meta:
meta_html += f'<div><strong>Pages:</strong> {meta["limited_pages"]["processed"]}/{meta["limited_pages"]["total"]}</div>'
meta_html += '</div>'
st.markdown(meta_html, unsafe_allow_html=True)
# Processing time gets its own line so the metadata fields render in a consistent order
if 'processing_time' in meta:
time_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
time_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Time:</div>'
time_html += f'<div>{meta["processing_time"]:.1f}s</div>'
time_html += '</div>'
st.markdown(time_html, unsafe_allow_html=True)
# Language metadata on a separate line, Subject Tags below
# First show languages if available
if 'languages' in result and result['languages']:
languages = [lang for lang in result['languages'] if lang is not None]
if languages:
# Create a dedicated line for Languages
lang_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
lang_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Language:</div>'
# Add language tags
for lang in languages:
# Clean language name if needed
clean_lang = str(lang).strip()
if clean_lang: # Only add if not empty
lang_html += f'<span class="subject-tag tag-language">{clean_lang}</span>'
lang_html += '</div>'
st.markdown(lang_html, unsafe_allow_html=True)
# Prepare download files
try:
# Get base filename
from utils.general_utils import create_descriptive_filename
original_file = result.get('file_name', 'document')
base_name = create_descriptive_filename(original_file, result, "")
base_name = os.path.splitext(base_name)[0]
# 1. JSON download - with base64 data truncated for readability
from utils.image_utils import truncate_base64_in_result
truncated_result = truncate_base64_in_result(result)
json_str = json.dumps(truncated_result, indent=2)
json_filename = f"{base_name}.json"
json_b64 = base64.b64encode(json_str.encode()).decode()
# 2. Create ZIP with all files
from utils.image_utils import create_results_zip_in_memory
zip_data = create_results_zip_in_memory(result)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
zip_filename = f"{base_name}_{timestamp}.zip"
zip_b64 = base64.b64encode(zip_data).decode()
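# Both payloads are embedded below as base64 "data:" URIs inside <a download>
# links, so the browser saves the files without any extra server endpoint.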
# Add download line with metadata styling
download_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
download_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Download:</div>'
# Download links in order of importance, matching the zip file contents
download_html += f'<a href="data:application/json;base64,{json_b64}" download="{json_filename}" class="subject-tag tag-download">JSON</a>'
# Zip download link (packages everything together)
download_html += f'<a href="data:application/zip;base64,{zip_b64}" download="{zip_filename}" class="subject-tag tag-download">Zip Archive</a>'
download_html += '</div>'
st.markdown(download_html, unsafe_allow_html=True)
except Exception:
# Fail silently: a problem preparing downloads should not disrupt the rest of the UI
pass
# Create a separate line for Time if we have time-related tags
if 'topics' in result and result['topics']:
time_tags = [topic for topic in result['topics']
if any(term in topic.lower() for term in ["century", "pre-", "era"])]
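# Illustrative matches (assumed topic strings): "19th Century", "Pre-1800", and
# "Victorian era" would all be routed to the Time line rather than Subject Tags.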
if time_tags:
time_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
time_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Time:</div>'
for tag in time_tags:
time_html += f'<span class="subject-tag tag-time-period">{tag}</span>'
time_html += '</div>'
st.markdown(time_html, unsafe_allow_html=True)
# Then display remaining subject tags if available
if 'topics' in result and result['topics']:
# Filter out time-related tags which are already displayed
subject_tags = [topic for topic in result['topics']
if not any(term in topic.lower() for term in ["century", "pre-", "era"])]
if subject_tags:
# Create a separate line for Subject Tags
tags_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
tags_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Subject Tags:</div>'
tags_html += '<div style="display: flex; flex-wrap: wrap; gap: 2px; align-items: center;">'
# Generate a badge for each remaining tag
for topic in subject_tags:
# Determine tag category class
tag_class = "subject-tag" # Default class
# Add specialized class based on category
if any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
tag_class += " tag-language" # Languages
elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
tag_class += " tag-document-type" # Document types
elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
tag_class += " tag-subject" # Subject domains
elif "historical" in topic.lower() and "document" in topic.lower():
tag_class += " tag-document-type" # "Historical Document Analysis" should be a document type
# Add each tag as an inline span
tags_html += f'<span class="{tag_class}">{topic}</span>'
# Close the containers
tags_html += '</div></div>'
# Render the subject tags section
st.markdown(tags_html, unsafe_allow_html=True)
# Check if we have OCR content
if 'ocr_contents' in result:
# Single container for the structured content; the view tabs are created further below
content_tab1 = st.container()
# Check for images in the result to use later
has_images = result.get('has_images', False)
has_image_data = ('pages_data' in result and any(page.get('images', []) for page in result.get('pages_data', [])))
has_raw_images = ('raw_response_data' in result and 'pages' in result['raw_response_data'] and
any('images' in page for page in result['raw_response_data']['pages']
if isinstance(page, dict)))
# Display structured content
with content_tab1:
# Display structured content with markdown formatting
if isinstance(result['ocr_contents'], dict):
# CSS is now handled in the main layout.py file
# Collect all available images from the result
available_images = []
if has_images and 'pages_data' in result:
for page_idx, page in enumerate(result['pages_data']):
if 'images' in page and len(page['images']) > 0:
for img_idx, img in enumerate(page['images']):
if 'image_base64' in img:
available_images.append({
'source': 'pages_data',
'page': page_idx,
'index': img_idx,
'data': img['image_base64']
})
# Get images from raw response as well
if 'raw_response_data' in result:
raw_data = result['raw_response_data']
if isinstance(raw_data, dict) and 'pages' in raw_data:
for page_idx, page in enumerate(raw_data['pages']):
if isinstance(page, dict) and 'images' in page:
for img_idx, img in enumerate(page['images']):
if isinstance(img, dict) and 'base64' in img:
available_images.append({
'source': 'raw_response',
'page': page_idx,
'index': img_idx,
'data': img['base64']
})
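# Each entry in available_images has the shape
# {'source': 'pages_data' | 'raw_response', 'page': <int>, 'index': <int>, 'data': <base64 string>}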
# Extract images for display at the top
images_to_display = []
# First, collect all available images
for img_idx, img in enumerate(available_images):
if 'data' in img:
images_to_display.append({
'data': img['data'],
'id': img.get('id', f"img_{img_idx}"),
'index': img_idx
})
# Image display now only happens in the Images tab
# Organize sections in a logical order - prioritize main_text
section_order = ["title", "author", "date", "summary", "main_text", "content", "transcript", "metadata"]
ordered_sections = []
# Add known sections first in preferred order
for section_name in section_order:
if section_name in result['ocr_contents'] and result['ocr_contents'][section_name]:
ordered_sections.append(section_name)
# Add any remaining sections
for section in result['ocr_contents'].keys():
if (section not in ordered_sections and
section not in ['error', 'partial_text'] and
result['ocr_contents'][section]):
ordered_sections.append(section)
# If only raw_text is available and no other content, add it last
if ('raw_text' in result['ocr_contents'] and
result['ocr_contents']['raw_text'] and
len(ordered_sections) == 0):
ordered_sections.append('raw_text')
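# Illustrative outcome (assumed input): for ocr_contents like
# {'title': ..., 'main_text': ..., 'marginalia': ...} this yields
# ['title', 'main_text', 'marginalia']: known sections first, extra sections appended.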
# Add minimal spacing before OCR results
st.markdown("<div style='margin: 8px 0 4px 0;'></div>", unsafe_allow_html=True)
# Create tabs for different views
if has_images:
tabs = st.tabs(["Document Content", "Raw JSON", "Images"])
doc_tab, json_tab, img_tab = tabs
else:
tabs = st.tabs(["Document Content", "Raw JSON"])
doc_tab, json_tab = tabs
img_tab = None
# Document Content tab with simple, clean formatting that matches markdown export files
with doc_tab:
# Create a single unified content section
st.markdown("## Text Content")
# Present content directly in the format used in markdown export files
if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict):
# Get all content fields that should be displayed
content_fields = {}
# Add all available content fields (left_page, right_page, etc)
for field, content in result['ocr_contents'].items():
# Skip certain fields that shouldn't be displayed
if field in ['error', 'partial_text'] or not content:
continue
# Clean the content if it's a string
if isinstance(content, str) and content.strip():
content_fields[field] = content.strip()
# Handle dictionary or list content
elif isinstance(content, (dict, list)):
formatted_content = format_structured_data(content)
if formatted_content:
content_fields[field] = formatted_content
# Process nested dictionary structures
def flatten_content_fields(fields, parent_key=""):
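# Flattens nested ocr_contents into {key: text} pairs for display.
# Illustrative example: {'left_page': {'text': 'Dear Sir'}, 'date': '1850'}
# becomes {'left_page': 'Dear Sir', 'date': '1850'}; deeper nesting joins the
# key path with underscores (e.g. 'metadata_author').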
flat_fields = {}
for field, content in fields.items():
# Skip certain fields
if field in ['error', 'partial_text'] or not content:
continue
# Handle string content
if isinstance(content, str) and content.strip():
key = f"{parent_key}_{field}".strip("_")
flat_fields[key] = content.strip()
# Handle dictionary content
elif isinstance(content, dict):
# If the dictionary has a 'text' key, extract just that value
if 'text' in content and isinstance(content['text'], str):
key = f"{parent_key}_{field}".strip("_")
flat_fields[key] = content['text'].strip()
# Otherwise, recursively process nested dictionaries
else:
nested_fields = flatten_content_fields(content, f"{parent_key}_{field}")
flat_fields.update(nested_fields)
# Handle list content
elif isinstance(content, list):
formatted_content = format_structured_data(content)
if formatted_content:
key = f"{parent_key}_{field}".strip("_")
flat_fields[key] = formatted_content
return flat_fields
# Flatten the content structure
flat_content_fields = flatten_content_fields(result['ocr_contents'])
# Display the flattened content fields with proper formatting
for field, content in flat_content_fields.items():
# Skip any empty content
if not content or not content.strip():
continue
# Format field name as in the markdown export
field_display = field.replace('_', ' ')
# Don't try to parse text content as JSON: historical text may legitimately contain curly braces
# For raw_text field, display only the content without the field name
if field == 'raw_text':
st.markdown(content)
else:
# For other fields, display the field name in bold followed by the content
st.markdown(f"**{field}:** {content}")
# Add spacing between fields
st.markdown("\n\n")
# Raw JSON tab - displays the exact same JSON that's downloaded via the JSON button
with json_tab:
# Use the same truncated JSON that's used in the download button
from utils.image_utils import truncate_base64_in_result
truncated_result = truncate_base64_in_result(result)
# Display JSON with a copy button using Streamlit's built-in functionality
st.json(truncated_result)
# Images tab - for viewing document images
if has_images and img_tab:
with img_tab:
# Display each available image
for i, img in enumerate(images_to_display):
st.image(img['data'], caption=f"Image {i+1}", use_container_width=True)
# Display custom prompt if provided
if custom_prompt:
with st.expander("Custom Processing Instructions"):
st.write(custom_prompt)