# Source: Hugging Face Space "milwright/historical-ocr" (status: Running).
# File size: 20,437 bytes.

"""
UI utilities for OCR results display.
"""
import base64
import html
import io
import json
import logging
import os
from datetime import datetime

import streamlit as st

from utils.content_utils import classify_document_content, format_structured_data
from utils.text_utils import format_ocr_text

# Inline styles shared by every single-line metadata row.
_ROW_STYLE = "display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;"
_LABEL_STYLE = "margin-right: 0.3rem; font-weight: bold;"

# Substrings that mark a topic as a time-period tag rather than a subject tag.
_TIME_PERIOD_TERMS = ("century", "pre-", "era")

# (CSS class suffix, substrings) pairs checked in order; the first match wins.
_TAG_CATEGORIES = (
    (" tag-language", ("language", "english", "french", "german", "latin")),
    (" tag-document-type", ("letter", "newspaper", "book", "form", "document", "recipe")),
    (" tag-subject", ("travel", "military", "science", "medicine", "education", "art", "literature")),
)

# Keys of ``ocr_contents`` that are never shown in the text view.
_HIDDEN_CONTENT_FIELDS = ('error', 'partial_text')


def _esc(value):
    """Return ``value`` as text that is safe inside HTML content or attributes."""
    return html.escape(str(value), quote=True)


def _metadata_row(label, inner_html):
    """Wrap ``inner_html`` (already escaped by the caller) in a labelled flex row."""
    return (
        f'<div style="{_ROW_STYLE}">'
        f'<div style="{_LABEL_STYLE}">{label}</div>'
        f'{inner_html}'
        '</div>'
    )


def _render_primary_metadata(result):
    """Render the compact "Type" / "Pages" line (the wrapper is emitted even when empty)."""
    parts = ['<div style="display: flex; flex-wrap: wrap; gap: 0.3rem; margin-bottom: 0.3rem;">']

    if 'detected_document_type' in result:
        parts.append(f'<div><strong>Type:</strong> {_esc(result["detected_document_type"])}</div>')

    # Only show the page counter when both numbers are actually available.
    pages = result.get('limited_pages')
    if isinstance(pages, dict) and 'processed' in pages and 'total' in pages:
        parts.append(
            f'<div><strong>Pages:</strong> {_esc(pages["processed"])}/{_esc(pages["total"])}</div>'
        )

    parts.append('</div>')
    st.markdown("".join(parts), unsafe_allow_html=True)


def _render_processing_time(result):
    """Render the processing time row; skipped when the value is missing or not numeric."""
    if 'processing_time' not in result:
        return
    try:
        seconds = float(result['processing_time'])
    except (TypeError, ValueError):
        return
    st.markdown(_metadata_row("Time:", f'<div>{seconds:.1f}s</div>'), unsafe_allow_html=True)


def _render_languages(result):
    """Render one tag per non-empty language name; nothing is drawn when none remain."""
    names = [str(lang).strip() for lang in (result.get('languages') or []) if lang is not None]
    tags = [f'<span class="subject-tag tag-language">{_esc(name)}</span>' for name in names if name]
    if tags:
        st.markdown(_metadata_row("Language:", "".join(tags)), unsafe_allow_html=True)


def _render_download_links(result):
    """Render JSON and ZIP download links as base64 ``data:`` URIs.

    This is best-effort: any failure is logged and the rest of the page still renders.
    """
    try:
        from utils.general_utils import create_descriptive_filename
        from utils.image_utils import create_results_zip_in_memory, truncate_base64_in_result

        original_file = result.get('file_name', 'document')
        base_name = os.path.splitext(create_descriptive_filename(original_file, result, ""))[0]

        # JSON download - embedded base64 payloads are truncated for readability.
        json_str = json.dumps(truncate_base64_in_result(result), indent=2)
        json_b64 = base64.b64encode(json_str.encode()).decode()
        json_filename = f"{base_name}.json"

        # ZIP download - timestamped so repeated downloads do not collide.
        zip_b64 = base64.b64encode(create_results_zip_in_memory(result)).decode()
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        zip_filename = f"{base_name}_{timestamp}.zip"

        links = (
            f'<a href="data:application/json;base64,{json_b64}" download="{_esc(json_filename)}" '
            'class="subject-tag tag-download">JSON</a>'
            f'<a href="data:application/zip;base64,{zip_b64}" download="{_esc(zip_filename)}" '
            'class="subject-tag tag-download">Zip Archive</a>'
        )
        st.markdown(_metadata_row("Download:", links), unsafe_allow_html=True)
    except Exception:
        # Downloads must never break the results view, but the failure should be traceable.
        logging.getLogger(__name__).warning(
            "Could not prepare download links for OCR results", exc_info=True
        )


def _is_time_period(topic):
    """Return True when ``topic`` contains one of the time-period substrings."""
    lowered = topic.lower()
    return any(term in lowered for term in _TIME_PERIOD_TERMS)


def _tag_class(topic):
    """Return the CSS class string for a subject tag (plain substring matching)."""
    lowered = topic.lower()
    for suffix, terms in _TAG_CATEGORIES:
        if any(term in lowered for term in terms):
            return "subject-tag" + suffix
    return "subject-tag"


def _render_topic_tags(result):
    """Render time-period topics on a "Time:" row and all other topics as subject tags."""
    topics = [str(topic) for topic in (result.get('topics') or []) if topic is not None]
    time_tags = [topic for topic in topics if _is_time_period(topic)]
    subject_tags = [topic for topic in topics if not _is_time_period(topic)]

    if time_tags:
        spans = "".join(
            f'<span class="subject-tag tag-time-period">{_esc(tag)}</span>' for tag in time_tags
        )
        st.markdown(_metadata_row("Time:", spans), unsafe_allow_html=True)

    if subject_tags:
        spans = "".join(
            f'<span class="{_tag_class(topic)}">{_esc(topic)}</span>' for topic in subject_tags
        )
        wrapped = (
            '<div style="display: flex; flex-wrap: wrap; gap: 2px; align-items: center;">'
            f'{spans}</div>'
        )
        st.markdown(_metadata_row("Subject Tags:", wrapped), unsafe_allow_html=True)


def _flatten_content_fields(fields, parent_key=""):
    """Flatten nested OCR content into ``{"parent_child": text}`` pairs.

    Strings are stripped; a dict with a string ``'text'`` entry contributes only
    that text; other dicts are flattened recursively; lists are formatted with
    ``format_structured_data``. Empty values and hidden fields are dropped.
    """
    flat = {}
    for field, content in fields.items():
        if field in _HIDDEN_CONTENT_FIELDS or not content:
            continue

        key = f"{parent_key}_{field}".strip("_")
        if isinstance(content, str):
            text = content.strip()
            if text:
                flat[key] = text
        elif isinstance(content, dict):
            if isinstance(content.get('text'), str):
                flat[key] = content['text'].strip()
            else:
                flat.update(_flatten_content_fields(content, f"{parent_key}_{field}"))
        elif isinstance(content, list):
            formatted = format_structured_data(content)
            if formatted:
                flat[key] = formatted
    return flat


def _render_text_content(ocr_contents):
    """Render every flattened OCR field as markdown inside the current tab."""
    st.markdown("## Text Content")
    for field, content in _flatten_content_fields(ocr_contents).items():
        if not content or not content.strip():
            continue

        # Text is passed through untouched: historical text may contain braces
        # or other characters that must not be reinterpreted as JSON.
        if field == 'raw_text':
            st.markdown(f"{content}")
        else:
            # Show a readable label ("main text") instead of the raw key ("main_text").
            field_display = field.replace('_', ' ')
            st.markdown(f"**{field_display}:** {content}")

        # Add spacing between fields
        st.markdown("\n\n")


def _collect_image_data(result):
    """Return image payloads from ``pages_data`` first, then ``raw_response_data``."""
    found = []

    for page in result.get('pages_data') or []:
        if not isinstance(page, dict):
            continue
        for img in page.get('images') or []:
            if isinstance(img, dict) and 'image_base64' in img:
                found.append(img['image_base64'])

    raw_data = result.get('raw_response_data')
    if isinstance(raw_data, dict):
        for page in raw_data.get('pages') or []:
            if not isinstance(page, dict):
                continue
            for img in page.get('images') or []:
                if isinstance(img, dict) and 'base64' in img:
                    found.append(img['base64'])

    return found


def _render_ocr_contents(result):
    """Render the Document Content / Raw JSON / Images tabs for dict-shaped OCR content."""
    if 'ocr_contents' not in result:
        return

    ocr_contents = result['ocr_contents']
    with st.container():
        if not isinstance(ocr_contents, dict):
            return

        # Add minimal spacing before OCR results
        st.markdown("<div style='margin: 8px 0 4px 0;'></div>", unsafe_allow_html=True)

        # The Images tab only exists when the result is flagged as having images.
        if result.get('has_images', False):
            doc_tab, json_tab, img_tab = st.tabs(["Document Content", "Raw JSON", "Images"])
        else:
            doc_tab, json_tab = st.tabs(["Document Content", "Raw JSON"])
            img_tab = None

        with doc_tab:
            _render_text_content(ocr_contents)

        with json_tab:
            # Same truncation as the JSON download, so both show identical data.
            from utils.image_utils import truncate_base64_in_result
            st.json(truncate_base64_in_result(result))

        if img_tab is not None:
            with img_tab:
                for number, data in enumerate(_collect_image_data(result), start=1):
                    st.image(data, caption=f"Image {number}", use_container_width=True)


def display_results(result, container, custom_prompt=""):
    """Display OCR results in the provided container.

    Renders, in order: primary metadata, processing time, languages, download
    links, topic tags, the OCR content tabs and the custom prompt. Each section
    is independent, so a missing key only hides its own section.

    Args:
        result: Mapping of OCR output. Keys read here include
            ``detected_document_type``, ``limited_pages``, ``processing_time``,
            ``languages``, ``topics``, ``file_name``, ``ocr_contents``,
            ``has_images``, ``pages_data`` and ``raw_response_data``.
        container: Context manager the output is rendered into (presumably a
            Streamlit container or column - confirm against callers).
        custom_prompt: Optional processing instructions; shown in an expander
            when non-empty.
    """
    with container:
        # Add heading for document metadata
        st.markdown("### Document Metadata")

        _render_primary_metadata(result)
        _render_processing_time(result)
        _render_languages(result)
        _render_download_links(result)
        _render_topic_tags(result)
        _render_ocr_contents(result)

        # Display custom prompt if provided
        if custom_prompt:
            with st.expander("Custom Processing Instructions"):
                st.write(custom_prompt)