""" UI utilities for OCR results display. """ import os import streamlit as st import json import base64 import io from datetime import datetime from utils.text_utils import format_ocr_text from utils.content_utils import classify_document_content, format_structured_data def display_results(result, container, custom_prompt=""): """Display OCR results in the provided container""" with container: # Add heading for document metadata st.markdown("### Document Metadata") # Filter out large data structures from metadata display meta = {k: v for k, v in result.items() if k not in ['pages_data', 'illustrations', 'ocr_contents', 'raw_response_data']} # Create a compact metadata section for primary metadata meta_html = '
' # Document type if 'detected_document_type' in meta: meta_html += f'
Type: {meta["detected_document_type"]}
' # Page information if 'limited_pages' in meta: meta_html += f'
Pages: {meta["limited_pages"]["processed"]}/{meta["limited_pages"]["total"]}
' meta_html += '
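        # Renders as a single compact badge line, e.g. "Type: letter  Pages: 2/3"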
        st.markdown(meta_html, unsafe_allow_html=True)

        # Processing time - separate section for proper ordering of all metadata fields
        if 'processing_time' in meta:
            time_html = '<div class="metadata-line">'
            time_html += '<span class="metadata-label">Time:</span>'
            time_html += f'<span class="metadata-value">{meta["processing_time"]:.1f}s</span>'
            time_html += '</div>'
            st.markdown(time_html, unsafe_allow_html=True)

        # Language metadata on a separate line, Subject Tags below
        # First show languages if available
        if 'languages' in result and result['languages']:
            languages = [lang for lang in result['languages'] if lang is not None]
            if languages:
                # Create a dedicated line for Languages
                lang_html = '<div class="metadata-line">'
                lang_html += '<span class="metadata-label">Language:</span>'
                # Add language tags
                for lang in languages:
                    # Clean language name if needed
                    clean_lang = str(lang).strip()
                    if clean_lang:  # Only add if not empty
                        lang_html += f'<span class="subject-tag tag-language">{clean_lang}</span>'
                lang_html += '</div>'
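                # Note: duplicate entries in result['languages'] are rendered as
                # repeated badges; only None values are filtered out above.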
                st.markdown(lang_html, unsafe_allow_html=True)

        # Prepare download files
        try:
            # Get base filename
            from utils.general_utils import create_descriptive_filename
            original_file = result.get('file_name', 'document')
            base_name = create_descriptive_filename(original_file, result, "")
            base_name = os.path.splitext(base_name)[0]

            # 1. JSON download - with base64 data truncated for readability
            from utils.image_utils import truncate_base64_in_result
            truncated_result = truncate_base64_in_result(result)
            json_str = json.dumps(truncated_result, indent=2)
            json_filename = f"{base_name}.json"
            json_b64 = base64.b64encode(json_str.encode()).decode()

            # 2. Create ZIP with all files
            from utils.image_utils import create_results_zip_in_memory
            zip_data = create_results_zip_in_memory(result)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            zip_filename = f"{base_name}_{timestamp}.zip"
            zip_b64 = base64.b64encode(zip_data).decode()

            # Add download line with metadata styling
            download_html = '<div class="metadata-line">'
            download_html += '<span class="metadata-label">Download:</span>'
            # Download links in order of importance, matching the zip file contents
            download_html += f'<a href="data:application/json;base64,{json_b64}" download="{json_filename}" class="download-link">JSON</a>'
            # Zip download link (packages everything together)
            download_html += f'<a href="data:application/zip;base64,{zip_b64}" download="{zip_filename}" class="download-link">Zip Archive</a>'
            download_html += '</div>'
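            # Both links embed their payloads as base64 data URIs so they can share
            # one styled metadata line. Base64 inflates size by roughly a third, so
            # for very large archives Streamlit's native widget is a safer option,
            # e.g. st.download_button("Zip Archive", data=zip_data,
            #      file_name=zip_filename, mime="application/zip")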
            st.markdown(download_html, unsafe_allow_html=True)
        except Exception:
            # Silent fail for downloads - don't disrupt the UI
            pass

        # Create a separate line for Time if we have time-related tags
        if 'topics' in result and result['topics']:
            time_tags = [topic for topic in result['topics']
                         if any(term in topic.lower() for term in ["century", "pre-", "era"])]
            if time_tags:
                time_html = '<div class="metadata-line">'
                time_html += '<span class="metadata-label">Time:</span>'
                for tag in time_tags:
                    time_html += f'<span class="subject-tag tag-time">{tag}</span>'
                time_html += '</div>'
                st.markdown(time_html, unsafe_allow_html=True)

        # Then display remaining subject tags if available
        if 'topics' in result and result['topics']:
            # Filter out time-related tags which are already displayed
            subject_tags = [topic for topic in result['topics']
                            if not any(term in topic.lower() for term in ["century", "pre-", "era"])]
            if subject_tags:
                # Create a separate line for Subject Tags
                tags_html = '<div class="metadata-line">'
                tags_html += '<span class="metadata-label">Subject Tags:</span>'
                tags_html += '<span class="tags-container">'
                # Generate a badge for each remaining tag
                for topic in subject_tags:
                    # Determine tag category class
                    tag_class = "subject-tag"  # Default class
                    # Add specialized class based on category
                    if any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
                        tag_class += " tag-language"  # Languages
                    elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
                        # Document types; this branch also covers tags such as
                        # "Historical Document Analysis", since they contain "document"
                        tag_class += " tag-document-type"
                    elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
                        tag_class += " tag-subject"  # Subject domains
                    # Add each tag as an inline span
                    tags_html += f'<span class="{tag_class}">{topic}</span>'
                # Close the containers
                tags_html += '</span></div>'
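                # The category checks above are case-insensitive substring heuristics;
                # a tag like "French Literature" matches the language terms first and
                # is styled tag-language rather than tag-subject.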
                # Render the subject tags section
                st.markdown(tags_html, unsafe_allow_html=True)

        # Check if we have OCR content
        if 'ocr_contents' in result:
            # Top-level container for the structured content (tabs are created inside it below)
            content_tab1 = st.container()

            # Check for images in the result to use later
            has_images = result.get('has_images', False)
            has_image_data = ('pages_data' in result and
                              any(page.get('images', []) for page in result.get('pages_data', [])))
            has_raw_images = ('raw_response_data' in result and
                              'pages' in result['raw_response_data'] and
                              any('images' in page for page in result['raw_response_data']['pages']
                                  if isinstance(page, dict)))

            # Display structured content
            with content_tab1:
                # Display structured content with markdown formatting
                if isinstance(result['ocr_contents'], dict):
                    # CSS is now handled in the main layout.py file

                    # Collect all available images from the result
                    available_images = []
                    if has_images and 'pages_data' in result:
                        for page_idx, page in enumerate(result['pages_data']):
                            if 'images' in page and len(page['images']) > 0:
                                for img_idx, img in enumerate(page['images']):
                                    if 'image_base64' in img:
                                        available_images.append({
                                            'source': 'pages_data',
                                            'page': page_idx,
                                            'index': img_idx,
                                            'data': img['image_base64']
                                        })

                    # Get images from raw response as well
                    if 'raw_response_data' in result:
                        raw_data = result['raw_response_data']
                        if isinstance(raw_data, dict) and 'pages' in raw_data:
                            for page_idx, page in enumerate(raw_data['pages']):
                                if isinstance(page, dict) and 'images' in page:
                                    for img_idx, img in enumerate(page['images']):
                                        if isinstance(img, dict) and 'base64' in img:
                                            available_images.append({
                                                'source': 'raw_response',
                                                'page': page_idx,
                                                'index': img_idx,
                                                'data': img['base64']
                                            })

                    # Collect images for display; rendering happens only in the Images tab
                    images_to_display = []
                    for img_idx, img in enumerate(available_images):
                        if 'data' in img:
                            images_to_display.append({
                                'data': img['data'],
                                'id': img.get('id', f"img_{img_idx}"),
                                'index': img_idx
                            })

                    # Organize sections in a logical order - prioritize main_text
                    section_order = ["title", "author", "date", "summary", "main_text",
                                     "content", "transcript", "metadata"]
                    ordered_sections = []

                    # Add known sections first in preferred order
                    for section_name in section_order:
                        if section_name in result['ocr_contents'] and result['ocr_contents'][section_name]:
                            ordered_sections.append(section_name)

                    # Add any remaining sections
                    for section in result['ocr_contents'].keys():
                        if (section not in ordered_sections and
                                section not in ['error', 'partial_text'] and
                                result['ocr_contents'][section]):
                            ordered_sections.append(section)

                    # If only raw_text is available and no other content, add it last
                    if ('raw_text' in result['ocr_contents'] and
                            result['ocr_contents']['raw_text'] and
                            len(ordered_sections) == 0):
                        ordered_sections.append('raw_text')

                    # Add minimal spacing before OCR results (spacer markup assumed)
                    st.markdown("<div style='margin: 0.5em 0;'></div>", unsafe_allow_html=True)
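                    # Note: ordered_sections records a preferred display order, but the
                    # Document Content tab below currently flattens and renders every
                    # field in dictionary order, so the list is informational only.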
", unsafe_allow_html=True) # Create tabs for different views if has_images: tabs = st.tabs(["Document Content", "Raw JSON", "Images"]) doc_tab, json_tab, img_tab = tabs else: tabs = st.tabs(["Document Content", "Raw JSON"]) doc_tab, json_tab = tabs img_tab = None # Document Content tab with simple, clean formatting that matches markdown export files with doc_tab: # Create a single unified content section st.markdown("## Text Content") # Present content directly in the format used in markdown export files if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict): # Get all content fields that should be displayed content_fields = {} # Add all available content fields (left_page, right_page, etc) for field, content in result['ocr_contents'].items(): # Skip certain fields that shouldn't be displayed if field in ['error', 'partial_text'] or not content: continue # Clean the content if it's a string if isinstance(content, str) and content.strip(): content_fields[field] = content.strip() # Handle dictionary or list content elif isinstance(content, (dict, list)): formatted_content = format_structured_data(content) if formatted_content: content_fields[field] = formatted_content # Process nested dictionary structures def flatten_content_fields(fields, parent_key=""): flat_fields = {} for field, content in fields.items(): # Skip certain fields if field in ['error', 'partial_text'] or not content: continue # Handle string content if isinstance(content, str) and content.strip(): key = f"{parent_key}_{field}".strip("_") flat_fields[key] = content.strip() # Handle dictionary content elif isinstance(content, dict): # If the dictionary has a 'text' key, extract just that value if 'text' in content and isinstance(content['text'], str): key = f"{parent_key}_{field}".strip("_") flat_fields[key] = content['text'].strip() # Otherwise, recursively process nested dictionaries else: nested_fields = flatten_content_fields(content, f"{parent_key}_{field}") flat_fields.update(nested_fields) # Handle list content elif isinstance(content, list): formatted_content = format_structured_data(content) if formatted_content: key = f"{parent_key}_{field}".strip("_") flat_fields[key] = formatted_content return flat_fields # Flatten the content structure flat_content_fields = flatten_content_fields(result['ocr_contents']) # Display the flattened content fields with proper formatting for field, content in flat_content_fields.items(): # Skip any empty content if not content or not content.strip(): continue # Format field name as in the markdown export field_display = field.replace('_', ' ') # Maintain content purity - don't parse text content as JSON # Historical text may contain curly braces that aren't JSON # For raw_text field, display only the content without the field name if field == 'raw_text': st.markdown(f"{content}") else: # For other fields, display the field name in bold followed by the content st.markdown(f"**{field}:** {content}") # Add spacing between fields st.markdown("\n\n") # Raw JSON tab - displays the exact same JSON that's downloaded via the JSON button with json_tab: # Use the same truncated JSON that's used in the download button from utils.image_utils import truncate_base64_in_result truncated_result = truncate_base64_in_result(result) # Format the JSON prettily json_str = json.dumps(truncated_result, indent=2) # Display JSON with a copy button using Streamlit's built-in functionality st.json(truncated_result) # Images tab - for viewing document images if has_images and img_tab: with 
                        with img_tab:
                            # Display each available image
                            for i, img in enumerate(images_to_display):
                                st.image(img['data'], caption=f"Image {i+1}", use_container_width=True)

        # Display custom prompt if provided
        if custom_prompt:
            with st.expander("Custom Processing Instructions"):
                st.write(custom_prompt)
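

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only): display_results expects an OCR
# result dict and a Streamlit container. The sample keys and values below are
# hypothetical; the real app builds `result` from the OCR pipeline. Run from
# the repo root so the `utils` package resolves: streamlit run <this file>
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_result = {
        'file_name': 'letter_1872.jpg',
        'detected_document_type': 'letter',
        'processing_time': 3.2,
        'languages': ['English'],
        'topics': ['19th Century', 'Letter', 'Travel'],
        'ocr_contents': {
            'title': 'Letter from Boston',
            'main_text': 'Dear Margaret, we arrived safely on Tuesday...',
        },
    }
    display_results(sample_result, st.container(), custom_prompt="")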