File size: 20,437 Bytes
c04ffe5
 
 
42dc069
c04ffe5
 
 
 
 
 
42dc069
c04ffe5
 
 
 
 
 
 
 
 
 
 
 
3030658
c04ffe5
 
 
 
 
 
 
 
 
 
 
 
 
3030658
 
 
 
 
 
 
 
c04ffe5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42dc069
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3030658
42dc069
 
 
 
 
 
 
c04ffe5
 
 
 
 
3030658
c04ffe5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3030658
 
c04ffe5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42dc069
c04ffe5
 
42dc069
c04ffe5
42dc069
 
 
 
 
 
 
 
 
 
 
 
c04ffe5
42dc069
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c04ffe5
42dc069
 
 
 
 
 
 
 
 
c04ffe5
42dc069
 
c04ffe5
42dc069
c04ffe5
42dc069
 
 
c04ffe5
 
42dc069
c04ffe5
42dc069
 
c04ffe5
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
"""
UI utilities for OCR results display.
"""
import base64
import html
import io
import json
import logging
import os
import re
from datetime import datetime

import streamlit as st

from utils.content_utils import classify_document_content, format_structured_data
from utils.text_utils import format_ocr_text

def display_results(result, container, custom_prompt=""):
    """Display OCR results in the provided container"""
    with container:
        # Add heading for document metadata
        st.markdown("### Document Metadata")
        
        # Filter out large data structures from metadata display
        meta = {k: v for k, v in result.items()
                if k not in ['pages_data', 'illustrations', 'ocr_contents', 'raw_response_data']}
        
        # Create a compact metadata section for primary metadata
        meta_html = '<div style="display: flex; flex-wrap: wrap; gap: 0.3rem; margin-bottom: 0.3rem;">'
        
        # Document type
        if 'detected_document_type' in meta:
            meta_html += f'<div><strong>Type:</strong> {meta["detected_document_type"]}</div>'
        
        # Page information
        if 'limited_pages' in meta:
            meta_html += f'<div><strong>Pages:</strong> {meta["limited_pages"]["processed"]}/{meta["limited_pages"]["total"]}</div>'
            
        meta_html += '</div>'
        st.markdown(meta_html, unsafe_allow_html=True)
        
        # Processing time - separate section for proper ordering of all metadata fields
        if 'processing_time' in meta:
            time_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
            time_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Time:</div>'
            time_html += f'<div>{meta["processing_time"]:.1f}s</div>'
            time_html += '</div>'
            st.markdown(time_html, unsafe_allow_html=True)
        
        # Language metadata on a separate line, Subject Tags below
        
        # First show languages if available
        if 'languages' in result and result['languages']:
            languages = [lang for lang in result['languages'] if lang is not None]
            if languages:
                # Create a dedicated line for Languages
                lang_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
                lang_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Language:</div>'
                
                # Add language tags
                for lang in languages:
                    # Clean language name if needed
                    clean_lang = str(lang).strip()
                    if clean_lang:  # Only add if not empty
                        lang_html += f'<span class="subject-tag tag-language">{clean_lang}</span>'
                
                lang_html += '</div>'
                st.markdown(lang_html, unsafe_allow_html=True)
                
        # Prepare download files
        try:
            # Get base filename
            from utils.general_utils import create_descriptive_filename
            original_file = result.get('file_name', 'document')
            base_name = create_descriptive_filename(original_file, result, "")
            base_name = os.path.splitext(base_name)[0]
            
            # 1. JSON download - with base64 data truncated for readability
            from utils.image_utils import truncate_base64_in_result
            truncated_result = truncate_base64_in_result(result)
            json_str = json.dumps(truncated_result, indent=2)
            json_filename = f"{base_name}.json"
            json_b64 = base64.b64encode(json_str.encode()).decode()
            
            # 2. Create ZIP with all files
            from utils.image_utils import create_results_zip_in_memory
            zip_data = create_results_zip_in_memory(result)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            zip_filename = f"{base_name}_{timestamp}.zip"
            zip_b64 = base64.b64encode(zip_data).decode()
            
            # Add download line with metadata styling
            download_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
            download_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Download:</div>'
            
            # Download links in order of importance, matching the zip file contents
            download_html += f'<a href="data:application/json;base64,{json_b64}" download="{json_filename}" class="subject-tag tag-download">JSON</a>'
            
            # Zip download link (packages everything together)
            download_html += f'<a href="data:application/zip;base64,{zip_b64}" download="{zip_filename}" class="subject-tag tag-download">Zip Archive</a>'
            
            download_html += '</div>'
            st.markdown(download_html, unsafe_allow_html=True)
        except Exception as e:
            # Silent fail for downloads - don't disrupt the UI
            pass
        
        # Create a separate line for Time if we have time-related tags
        if 'topics' in result and result['topics']:
            time_tags = [topic for topic in result['topics'] 
                       if any(term in topic.lower() for term in ["century", "pre-", "era"])]
            if time_tags:
                time_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
                time_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Time:</div>'
                for tag in time_tags:
                    time_html += f'<span class="subject-tag tag-time-period">{tag}</span>'
                time_html += '</div>'
                st.markdown(time_html, unsafe_allow_html=True)
        
        # Then display remaining subject tags if available
        if 'topics' in result and result['topics']:
            # Filter out time-related tags which are already displayed
            subject_tags = [topic for topic in result['topics'] 
                         if not any(term in topic.lower() for term in ["century", "pre-", "era"])]
            
            if subject_tags:
                # Create a separate line for Subject Tags
                tags_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
                tags_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Subject Tags:</div>'
                tags_html += '<div style="display: flex; flex-wrap: wrap; gap: 2px; align-items: center;">'
                
                # Generate a badge for each remaining tag
                for topic in subject_tags:
                    # Determine tag category class
                    tag_class = "subject-tag"  # Default class
                    
                    # Add specialized class based on category
                    if any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
                        tag_class += " tag-language"  # Languages
                    elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
                        tag_class += " tag-document-type"  # Document types
                    elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
                        tag_class += " tag-subject"  # Subject domains
                    elif "historical" in topic.lower() and "document" in topic.lower():
                        tag_class += " tag-document-type"  # "Historical Document Analysis" should be a document type
                    
                    # Add each tag as an inline span
                    tags_html += f'<span class="{tag_class}">{topic}</span>'
                
                # Close the containers
                tags_html += '</div></div>'
                
                # Render the subject tags section
                st.markdown(tags_html, unsafe_allow_html=True)
                        
            # Check if we have OCR content
            if 'ocr_contents' in result:
                # Create a single view instead of tabs
                content_tab1 = st.container()
                
                # Check for images in the result to use later
                has_images = result.get('has_images', False)
                has_image_data = ('pages_data' in result and any(page.get('images', []) for page in result.get('pages_data', [])))
                has_raw_images = ('raw_response_data' in result and 'pages' in result['raw_response_data'] and 
                              any('images' in page for page in result['raw_response_data']['pages'] 
                                  if isinstance(page, dict)))
            
            # Display structured content
            with content_tab1:
                # Display structured content with markdown formatting
                if isinstance(result['ocr_contents'], dict):
                    # CSS is now handled in the main layout.py file
                    
                    # Collect all available images from the result
                    available_images = []
                    if has_images and 'pages_data' in result:
                        for page_idx, page in enumerate(result['pages_data']):
                            if 'images' in page and len(page['images']) > 0:
                                for img_idx, img in enumerate(page['images']):
                                    if 'image_base64' in img:
                                        available_images.append({
                                            'source': 'pages_data',
                                            'page': page_idx,
                                            'index': img_idx,
                                            'data': img['image_base64']
                                        })
                    
                    # Get images from raw response as well
                    if 'raw_response_data' in result:
                        raw_data = result['raw_response_data']
                        if isinstance(raw_data, dict) and 'pages' in raw_data:
                            for page_idx, page in enumerate(raw_data['pages']):
                                if isinstance(page, dict) and 'images' in page:
                                    for img_idx, img in enumerate(page['images']):
                                        if isinstance(img, dict) and 'base64' in img:
                                            available_images.append({
                                                'source': 'raw_response',
                                                'page': page_idx,
                                                'index': img_idx,
                                                'data': img['base64']
                                            })
                    
                    # Extract images for display at the top
                    images_to_display = []
                    
                    # First, collect all available images
                    for img_idx, img in enumerate(available_images):
                        if 'data' in img:
                            images_to_display.append({
                                'data': img['data'],
                                'id': img.get('id', f"img_{img_idx}"),
                                'index': img_idx
                            })
                    
                    # Image display now only happens in the Images tab
                    
                    # Organize sections in a logical order - prioritize main_text
                    section_order = ["title", "author", "date", "summary", "main_text", "content", "transcript", "metadata"]
                    ordered_sections = []
                    
                    # Add known sections first in preferred order
                    for section_name in section_order:
                        if section_name in result['ocr_contents'] and result['ocr_contents'][section_name]:
                            ordered_sections.append(section_name)
                    
                    # Add any remaining sections
                    for section in result['ocr_contents'].keys():
                        if (section not in ordered_sections and 
                            section not in ['error', 'partial_text'] and 
                            result['ocr_contents'][section]):
                            ordered_sections.append(section)
                            
                    # If only raw_text is available and no other content, add it last
                    if ('raw_text' in result['ocr_contents'] and 
                        result['ocr_contents']['raw_text'] and 
                        len(ordered_sections) == 0):
                        ordered_sections.append('raw_text')
                    
                    # Add minimal spacing before OCR results
                    st.markdown("<div style='margin: 8px 0 4px 0;'></div>", unsafe_allow_html=True)
                    
                    # Create tabs for different views
                    if has_images:
                        tabs = st.tabs(["Document Content", "Raw JSON", "Images"])
                        doc_tab, json_tab, img_tab = tabs
                    else:
                        tabs = st.tabs(["Document Content", "Raw JSON"])
                        doc_tab, json_tab = tabs
                        img_tab = None
                    
                    # Document Content tab with simple, clean formatting that matches markdown export files
                    with doc_tab:
                        # Create a single unified content section
                        st.markdown("## Text Content")
                        
                        # Present content directly in the format used in markdown export files
                        if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict):
                            # Get all content fields that should be displayed
                            content_fields = {}
                            
                            # Add all available content fields (left_page, right_page, etc)
                            for field, content in result['ocr_contents'].items():
                                # Skip certain fields that shouldn't be displayed
                                if field in ['error', 'partial_text'] or not content:
                                    continue
                                    
                                # Clean the content if it's a string
                                if isinstance(content, str) and content.strip():
                                    content_fields[field] = content.strip()
                                # Handle dictionary or list content
                                elif isinstance(content, (dict, list)):
                                    formatted_content = format_structured_data(content)
                                    if formatted_content:
                                        content_fields[field] = formatted_content
                            
                            # Process nested dictionary structures
                            def flatten_content_fields(fields, parent_key=""):
                                flat_fields = {}
                                for field, content in fields.items():
                                    # Skip certain fields
                                    if field in ['error', 'partial_text'] or not content:
                                        continue
                                        
                                    # Handle string content
                                    if isinstance(content, str) and content.strip():
                                        key = f"{parent_key}_{field}".strip("_")
                                        flat_fields[key] = content.strip()
                                    # Handle dictionary content
                                    elif isinstance(content, dict):
                                        # If the dictionary has a 'text' key, extract just that value
                                        if 'text' in content and isinstance(content['text'], str):
                                            key = f"{parent_key}_{field}".strip("_")
                                            flat_fields[key] = content['text'].strip()
                                        # Otherwise, recursively process nested dictionaries
                                        else:
                                            nested_fields = flatten_content_fields(content, f"{parent_key}_{field}")
                                            flat_fields.update(nested_fields)
                                    # Handle list content
                                    elif isinstance(content, list):
                                        formatted_content = format_structured_data(content)
                                        if formatted_content:
                                            key = f"{parent_key}_{field}".strip("_")
                                            flat_fields[key] = formatted_content
                                            
                                return flat_fields
                            
                            # Flatten the content structure
                            flat_content_fields = flatten_content_fields(result['ocr_contents'])
                            
                            # Display the flattened content fields with proper formatting
                            for field, content in flat_content_fields.items():
                                # Skip any empty content
                                if not content or not content.strip():
                                    continue
                                    
                                # Format field name as in the markdown export
                                field_display = field.replace('_', ' ')
                                
                                # Maintain content purity - don't parse text content as JSON
                                # Historical text may contain curly braces that aren't JSON
                                
                                # For raw_text field, display only the content without the field name
                                if field == 'raw_text':
                                    st.markdown(f"{content}")
                                else:
                                    # For other fields, display the field name in bold followed by the content
                                    st.markdown(f"**{field}:** {content}")
                                
                                # Add spacing between fields
                                st.markdown("\n\n")
                    
                    # Raw JSON tab - displays the exact same JSON that's downloaded via the JSON button
                    with json_tab:
                        # Use the same truncated JSON that's used in the download button
                        from utils.image_utils import truncate_base64_in_result
                        truncated_result = truncate_base64_in_result(result)
                        
                        # Format the JSON prettily
                        json_str = json.dumps(truncated_result, indent=2)
                        
                        # Display JSON with a copy button using Streamlit's built-in functionality
                        st.json(truncated_result)
                        
                    
                    # Images tab - for viewing document images
                    if has_images and img_tab:
                        with img_tab:
                            # Display each available image
                            for i, img in enumerate(images_to_display):
                                st.image(img['data'], caption=f"Image {i+1}", use_container_width=True)
            
            # Display custom prompt if provided
            if custom_prompt:
                with st.expander("Custom Processing Instructions"):
                    st.write(custom_prompt)