File size: 10,753 Bytes
7647e70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
import os
import hashlib
import tempfile
import streamlit as st
import logging
import time
from datetime import datetime
from pathlib import Path
from structured_ocr import StructuredOCR
from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
from preprocessing import apply_preprocessing_to_file
from error_handler import handle_ocr_error, check_file_size

# Configure logging: module-wide logger named "ocr_processing" with its
# threshold set to INFO
logger = logging.getLogger("ocr_processing")
logger.setLevel(logging.INFO)

@st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None):
    """
    Run OCR on a file through Streamlit's data cache
    
    The decorator is configured with a 24 hour TTL and at most 20 entries.
    ``cache_key`` and ``preprocessing_options_hash`` are never read by the
    body; presumably they are accepted only so that the memoised entry
    varies with them.
    
    NOTE(review): ``file_path`` is an ordinary argument as well, so a fresh
    temporary path on every call may prevent cache hits - confirm against
    Streamlit's argument hashing rules.
    
    Args:
        file_path: Path to the file to process
        file_type: Type of file (pdf or image)
        use_vision: Whether to use vision model
        file_size_mb: File size in MB
        cache_key: Cache key for the file (not used by the body)
        preprocessing_options_hash: Hash of preprocessing options (not used by the body)
        
    Returns:
        dict: OCR result as returned by StructuredOCR.process_file
    """
    # The processor is built before the timer starts, so only the OCR call is timed
    ocr_engine = StructuredOCR()
    ocr_kwargs = {
        "file_type": file_type,
        "use_vision": use_vision,
        "file_size_mb": file_size_mb,
    }
    
    with timing(f"OCR processing of {file_type} file"):
        ocr_output = ocr_engine.process_file(file_path, **ocr_kwargs)
    
    return ocr_output

def _build_error_result(file_name, error_message, contents_error, partial_text):
    """
    Build the placeholder result returned when a file cannot be processed
    
    Args:
        file_name: Name of the uploaded file
        error_message: Message stored under the top-level "error" key
        contents_error: Message stored under ocr_contents["error"]
        partial_text: Text stored under ocr_contents["partial_text"]
        
    Returns:
        dict: Result with default topics/languages and the error details
    """
    return {
        "file_name": file_name,
        "topics": ["Document"],
        "languages": ["English"],
        "error": error_message,
        "ocr_contents": {
            "error": contents_error,
            "partial_text": partial_text
        }
    }


def _process_directly(temp_path, file_type, use_vision, custom_prompt, file_size_mb, pdf_rotation):
    """
    Run OCR on temp_path with a fresh StructuredOCR, bypassing the cache
    
    Returns:
        dict: OCR result as returned by StructuredOCR.process_file
    """
    processor = StructuredOCR()
    ocr_kwargs = {
        "file_path": temp_path,
        "file_type": file_type,
        "use_vision": use_vision,
        "custom_prompt": custom_prompt,
        "file_size_mb": file_size_mb,
    }
    # Rotation is only forwarded for PDFs (image rotation is handled in preprocessing)
    if file_type == "pdf":
        ocr_kwargs["pdf_rotation"] = pdf_rotation
    return processor.process_file(**ocr_kwargs)


def _run_ocr(temp_path, file_type, use_vision, file_size_mb, cache_key,
             preprocessing_options, custom_prompt, pdf_rotation, progress_reporter):
    """
    Run OCR via the cached function when it can serve the request, else directly
    
    Returns:
        dict: OCR result
    """
    # process_file_cached has no parameters for the prompt or the rotation, so
    # a cached run would silently ignore them; go direct whenever either is set.
    wants_rotation = file_type == "pdf" and bool(pdf_rotation)
    if not custom_prompt and not wants_rotation:
        try:
            return process_file_cached(
                temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options)
            )
        except Exception as e:
            # Best-effort: a failure in the cached path falls through to direct processing
            logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
            progress_reporter.update(60, f"Processing error: {str(e)}. Retrying...")
    
    return _process_directly(temp_path, file_type, use_vision, custom_prompt, file_size_mb, pdf_rotation)


def process_file(uploaded_file, use_vision=True, preprocessing_options=None, progress_reporter=None, 
                 pdf_dpi=150, max_pages=3, pdf_rotation=0, custom_prompt=None, perf_mode="Quality"):
    """
    Process the uploaded file and return the OCR results
    
    The upload is written to a temporary file (after optional preprocessing
    for images), run through OCR, enriched by process_result, and the
    temporary files are removed afterwards. Any exception is converted into
    an error result instead of being raised.
    
    Args:
        uploaded_file: The uploaded file to process (must provide getvalue() and name)
        use_vision: Whether to use vision model
        preprocessing_options: Dictionary of preprocessing options
        progress_reporter: ProgressReporter instance for UI updates
        pdf_dpi: DPI for PDF conversion (accepted but not forwarded to the OCR processor here)
        max_pages: Maximum number of pages to process (accepted but not forwarded here)
        pdf_rotation: PDF rotation value; a non-zero value bypasses the cache
        custom_prompt: Custom prompt for OCR; when set, the cache is bypassed
        perf_mode: Performance mode (Quality or Speed); currently has no effect in this function
        
    Returns:
        dict: OCR result, or an error result with "error" and "ocr_contents" keys
    """
    if preprocessing_options is None:
        preprocessing_options = {}
    
    # Create a container for progress indicators if not provided
    if progress_reporter is None:
        from ui_components import ProgressReporter
        progress_reporter = ProgressReporter(st.empty()).setup()
    
    # Every path appended here is deleted in the finally clause
    temp_file_paths = []
    
    try:
        # Read the upload once and reuse the bytes for validation and processing
        file_bytes = uploaded_file.getvalue()
        
        # Check if file size exceeds maximum allowed size
        is_valid, file_size_mb, error_message = check_file_size(file_bytes)
        if not is_valid:
            progress_reporter.complete(success=False)
            st.error(error_message)
            return _build_error_result(
                uploaded_file.name,
                error_message,
                error_message,
                "Document could not be processed due to size limitations."
            )
        
        progress_reporter.update(10, "Initializing OCR processor...")
        
        # Determine file type from extension
        file_ext = Path(uploaded_file.name).suffix.lower()
        file_type = "pdf" if file_ext == ".pdf" else "image"
        
        if file_type == "pdf":
            progress_reporter.update(20, "Converting PDF to images...")
            progress_reporter.update(30, "Processing PDF with OCR...")
            
            # Write the PDF to a temporary file; the context manager closes the
            # handle, and the path is registered first so it is cleaned up even
            # if the write fails.
            with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp:
                temp_path = tmp.name
                temp_file_paths.append(temp_path)
                tmp.write(file_bytes)
            
            cache_key = generate_cache_key(
                file_bytes, 
                file_type, 
                use_vision, 
                preprocessing_options, 
                pdf_rotation, 
                custom_prompt
            )
            
            result = _run_ocr(
                temp_path, file_type, use_vision, file_size_mb, cache_key,
                preprocessing_options, custom_prompt, pdf_rotation, progress_reporter
            )
            progress_reporter.update(90, "Finalizing results...")
        else:
            progress_reporter.update(20, "Preparing image for processing...")
            
            # Apply preprocessing if needed
            temp_path, preprocessing_applied = apply_preprocessing_to_file(
                file_bytes, 
                file_ext, 
                preprocessing_options, 
                temp_file_paths
            )
            
            if preprocessing_applied:
                progress_reporter.update(30, "Applied image preprocessing...")
            
            # The key is derived from the bytes of the file that is actually
            # sent to OCR; read them through a context manager so the handle
            # is closed.
            with open(temp_path, 'rb') as prepared_file:
                prepared_bytes = prepared_file.read()
            cache_key = generate_cache_key(
                prepared_bytes, 
                file_type, 
                use_vision, 
                preprocessing_options, 
                0,  # No rotation for images (handled in preprocessing)
                custom_prompt
            )
            
            progress_reporter.update(50, "Processing document with OCR...")
            result = _run_ocr(
                temp_path, file_type, use_vision, file_size_mb, cache_key,
                preprocessing_options, custom_prompt, pdf_rotation, progress_reporter
            )
            progress_reporter.update(80, "Analyzing document structure...")
            progress_reporter.update(90, "Finalizing results...")
        
        # Add additional metadata to result
        result = process_result(result, uploaded_file, preprocessing_options)
        
        progress_reporter.complete()
        
        return result
    except Exception as e:
        # Top-level boundary: report the failure and return an error result
        error_message = handle_ocr_error(e, progress_reporter)
        
        return _build_error_result(
            uploaded_file.name,
            error_message,
            f"Failed to process file: {error_message}",
            "Document could not be processed due to an error."
        )
    finally:
        # Clean up temporary files; failures are logged and never raised
        for path in temp_file_paths:
            try:
                if os.path.exists(path):
                    os.unlink(path)
                    logger.info(f"Removed temporary file: {path}")
            except Exception as e:
                logger.warning(f"Failed to remove temporary file {path}: {str(e)}")

def process_result(result, uploaded_file, preprocessing_options=None):
    """
    Enrich an OCR result in place with a timestamp, filename and topics
    
    The following keys are written on ``result``:
        timestamp: always overwritten with format_timestamp()
        processing_time: set to 0.0 only when missing
        descriptive_file_name: always overwritten
        topics: filled from extract_subject_tags only when missing or empty
    
    Args:
        result: OCR result dictionary (mutated and returned)
        uploaded_file: The uploaded file; only its name is read
        preprocessing_options: Dictionary of preprocessing options
        
    Returns:
        dict: The same result object, after enrichment
    """
    result['timestamp'] = format_timestamp()
    
    # Keep an existing processing time; default to zero otherwise
    result.setdefault('processing_time', 0.0)
    
    # Descriptive filename is derived from the original name and its extension
    original_name = uploaded_file.name
    extension = Path(original_name).suffix.lower()
    descriptive_name = create_descriptive_filename(
        original_name,
        result,
        extension,
        preprocessing_options
    )
    result['descriptive_file_name'] = descriptive_name
    
    # Pull the text out of the OCR contents, preferring 'raw_text' over 'content'
    text = ""
    if 'ocr_contents' in result:
        contents = result['ocr_contents']
        for text_key in ('raw_text', 'content'):
            if text_key in contents:
                text = contents[text_key]
                break
    
    # Only generate topics when none were supplied
    has_topics = 'topics' in result and result['topics']
    if not has_topics:
        result['topics'] = extract_subject_tags(result, text, preprocessing_options)
    
    return result