File size: 10,224 Bytes
7647e70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import os
import base64
import hashlib
import time
import logging
from datetime import datetime
from pathlib import Path
from functools import wraps
from constants import CONTENT_THEMES, PERIOD_TAGS, DEFAULT_TAGS, GENERIC_TAGS

# Configure logging
# Module-level logger shared by the helpers in this file. It is registered under
# the fixed name "utils" (not __name__), and its own threshold is set to INFO so
# the info-level messages emitted below (timings, temp-file removals) pass this
# logger's level check. No handlers are attached in the visible code.
logger = logging.getLogger("utils")
logger.setLevel(logging.INFO)

def get_base64_from_image(image_path):
    """
    Read an image file and return its content as a base64 string

    Args:
        image_path: Path of the file to read (opened in binary mode)

    Returns:
        str: Base64 text of the file content, or "" if reading/encoding fails
             (the failure is logged, not raised)
    """
    try:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes)
        return str(encoded, 'utf-8')
    except Exception as err:
        # Best-effort helper: report the problem and hand back an empty string
        logger.error(f"Error encoding image to base64: {str(err)}")
        return ""

def timing(description):
    """
    Build a context manager that logs how long its ``with`` body took

    Args:
        description: Label placed at the start of the log message

    Returns:
        TimingContext: Object exposing ``description`` and, once entered,
                       ``start_time`` (value of ``time.time()`` at entry)
    """
    class TimingContext:
        def __init__(self, label):
            self.description = label

        def __enter__(self):
            # Remember when the block started; the context object itself is
            # what ``with ... as`` binds
            self.start_time = time.time()
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            elapsed = time.time() - self.start_time
            logger.info(f"{self.description} took {elapsed:.2f} seconds")
            # False => exceptions raised inside the block keep propagating
            return False

    return TimingContext(description)

def format_timestamp(timestamp=None):
    """
    Format timestamp for display

    Args:
        timestamp: None (use the current local time), a datetime, or a string.
            Strings are parsed as "%Y-%m-%d %H:%M:%S" first and then as an
            ISO-format string (e.g. "2024-01-02T03:04:05" or a value with
            fractional seconds such as str(datetime.now()) produces).
            A string that matches neither form falls back to the current time.

    Returns:
        str: Timestamp rendered as "YYYY-MM-DD HH:MM"
    """
    if timestamp is None:
        timestamp = datetime.now()
    elif isinstance(timestamp, str):
        text = timestamp.strip()
        try:
            timestamp = datetime.strptime(text, "%Y-%m-%d %H:%M:%S")
        except ValueError:
            try:
                # Accept ISO variants (T separator, microseconds) instead of
                # silently replacing a perfectly valid timestamp with "now"
                timestamp = datetime.fromisoformat(text)
            except ValueError:
                # Unparseable text: keep the original best-effort fallback
                timestamp = datetime.now()

    return timestamp.strftime("%Y-%m-%d %H:%M")

def generate_cache_key(file_bytes, file_type, use_vision, preprocessing_options=None, pdf_rotation=0, custom_prompt=None):
    """
    Build the cache key identifying one OCR request

    The key has the form
    ``<md5 of file>_<file_type>_<use_vision>_<options digest>`` with an extra
    ``_<md5 of prompt>`` suffix when a custom prompt is given. The options
    digest is empty when there are no preprocessing options and no rotation.

    Args:
        file_bytes: File content as bytes
        file_type: Type of file (pdf or image)
        use_vision: Whether to use vision model
        preprocessing_options: Dictionary of preprocessing options
        pdf_rotation: PDF rotation value; only contributes when non-zero
        custom_prompt: Custom prompt for OCR

    Returns:
        str: Cache key
    """
    def md5_hex(text):
        return hashlib.md5(text.encode()).hexdigest()

    content_digest = hashlib.md5(file_bytes).hexdigest()
    has_rotation = pdf_rotation != 0

    if preprocessing_options:
        # Work on a copy so the caller's dictionary is never modified
        effective_options = preprocessing_options.copy()
        if has_rotation:
            effective_options['pdf_rotation'] = pdf_rotation
        # Sorting the items makes the digest independent of insertion order
        options_digest = md5_hex(str(sorted(effective_options.items())))
    elif has_rotation:
        # Rotation alone still has to distinguish cache entries
        options_digest = md5_hex(f"pdf_rotation_{pdf_rotation}")
    else:
        options_digest = ""

    key = f"{content_digest}_{file_type}_{use_vision}_{options_digest}"
    if not custom_prompt:
        return key

    prompt_digest = md5_hex(str(custom_prompt))
    return f"{key}_{prompt_digest}"

def handle_temp_files(temp_file_paths):
    """
    Delete the given temporary files, logging each outcome

    Paths that do not exist are skipped silently. A failure on one path is
    logged as a warning and does not stop the remaining paths from being
    processed.

    Args:
        temp_file_paths: List of temporary file paths to clean up
    """
    for candidate in temp_file_paths:
        try:
            if not os.path.exists(candidate):
                continue
            os.unlink(candidate)
            logger.info(f"Removed temporary file: {candidate}")
        except Exception as err:
            logger.warning(f"Failed to remove temporary file {candidate}: {str(err)}")

def create_descriptive_filename(original_filename, result, file_ext, preprocessing_options=None):
    """
    Create a descriptive filename for the result

    The name is ``<original stem><_doc type><_period><file_ext>``:
    the document type comes from ``result['detected_document_type']`` or,
    when that is missing/empty, from the first topic; the period comes from
    the first topic mentioning "century", "pre-" or the whole word "era".
    A period identical to the document-type suffix is not repeated.

    Args:
        original_filename: Original filename (only its stem is used)
        result: OCR result dictionary
        file_ext: File extension, appended verbatim
        preprocessing_options: Dictionary of preprocessing options
            (accepted for interface compatibility; not used here)

    Returns:
        str: Descriptive filename
    """
    # Local import: `re` is not among this module's top-level imports
    import re

    def as_suffix(label):
        # Lower-case, underscore-separated, and free of characters that are
        # path separators or otherwise unsafe in filenames
        slug = str(label).lower().replace(' ', '_')
        return "_" + re.sub(r'[\\/:*?"<>|]', '_', slug)

    # Get base name without extension
    original_name = Path(original_filename).stem

    # Only non-empty string topics can contribute to the filename
    topics = [t for t in (result.get('topics') or []) if isinstance(t, str) and t]

    # Add document type to filename if detected; an empty/None detected type
    # falls through to the first topic instead of producing a bare "_"
    doc_type = result.get('detected_document_type')
    if doc_type:
        doc_type_tag = as_suffix(doc_type)
    elif topics:
        doc_type_tag = as_suffix(topics[0])
    else:
        doc_type_tag = ""

    # Add period tag for historical context if available
    period_tag = ""
    for tag in topics:
        lowered = tag.lower()
        # "era" must be a whole word, otherwise "Literature", "General",
        # "Federal", ... would be mistaken for period tags
        if "century" in lowered or "pre-" in lowered or re.search(r'\bera\b', lowered):
            period_tag = as_suffix(tag)
            break

    # Avoid "name_victorian_era_victorian_era.ext" when the first topic served
    # as the document type and is also the period
    if period_tag == doc_type_tag:
        period_tag = ""

    # Generate final descriptive filename
    return f"{original_name}{doc_type_tag}{period_tag}{file_ext}"

def _add_unique_tag(tags, tag):
    """Append tag to tags unless it is already present."""
    if tag not in tags:
        tags.append(tag)


def _period_tag_from_text(raw_text):
    """Return the PERIOD_TAGS label covering the earliest year in raw_text, or None."""
    import re

    # Years 1000-1999 and 2000-2019
    year_matches = re.findall(r'\b1[0-9]{3}\b|\b20[0-1][0-9]\b', raw_text)
    if not year_matches:
        return None

    earliest = min(int(year) for year in year_matches)
    # PERIOD_TAGS is indexed as (first_year, last_year) -> tag
    for year_range, period_tag in PERIOD_TAGS.items():
        if year_range[0] <= earliest <= year_range[1]:
            return period_tag
    return None


def _preprocessing_tags(preprocessing_options):
    """Return the tags describing which preprocessing steps were applied."""
    tags = []

    if preprocessing_options.get("document_type", "standard") != "standard":
        doc_type = preprocessing_options["document_type"].capitalize()
        tags.append(f"Enhanced ({doc_type})")

    methods = []
    if preprocessing_options.get("grayscale", False):
        methods.append("Grayscale")
    if preprocessing_options.get("denoise", False):
        methods.append("Denoised")
    contrast_val = preprocessing_options.get("contrast", 0)
    if contrast_val != 0:
        methods.append("Contrast Enhanced" if contrast_val > 0 else "Contrast Reduced")
    if preprocessing_options.get("rotation", 0) != 0:
        methods.append("Rotated")

    if methods:
        tags.append("Preprocessed")
        # The specific method is only named when exactly one was used
        if len(methods) == 1:
            tags.append(methods[0])

    return tags


def extract_subject_tags(result, raw_text, preprocessing_options=None):
    """
    Extract subject tags from OCR result

    Tags are collected, in order and without duplicates, from: the result's
    existing topics, the detected document type, CONTENT_THEMES keywords found
    in the text, the PERIOD_TAGS entry for the earliest year in the text, the
    result's languages ("<lang> Language") and the preprocessing that was
    applied. Any error during collection is logged and the tags gathered so
    far are kept (DEFAULT_TAGS if there are none). The list is then padded
    from DEFAULT_TAGS and GENERIC_TAGS until it has at least 3 entries, if
    enough unused tags exist.

    Args:
        result: OCR result dictionary
        raw_text: Raw text from OCR
        preprocessing_options: Dictionary of preprocessing options

    Returns:
        list: Subject tags
    """
    subject_tags = []

    try:
        # Use existing topics as starting point if available
        if 'topics' in result and result['topics']:
            subject_tags = list(result['topics'])

        # Add document type if detected (an empty/None value is ignored so it
        # neither adds a blank tag nor aborts the remaining steps)
        if 'detected_document_type' in result and result['detected_document_type']:
            _add_unique_tag(subject_tags, result['detected_document_type'].capitalize())

        if raw_text:
            # Analyze content for common themes based on keywords
            raw_text_lower = raw_text.lower()
            for theme, keywords in CONTENT_THEMES.items():
                if any(keyword in raw_text_lower for keyword in keywords):
                    _add_unique_tag(subject_tags, theme)

            # Add document period tag if date patterns are detected
            period_tag = _period_tag_from_text(raw_text)
            if period_tag is not None:
                _add_unique_tag(subject_tags, period_tag)

        # Add languages as topics if available. The duplicate check is made on
        # the tag that is actually appended ("<lang> Language"), not on the
        # bare language name.
        if 'languages' in result and result['languages']:
            for lang in result['languages']:
                if lang:
                    _add_unique_tag(subject_tags, f"{lang} Language")

        # Add preprocessing information as tags if preprocessing was applied
        if preprocessing_options:
            for tag in _preprocessing_tags(preprocessing_options):
                _add_unique_tag(subject_tags, tag)

    except Exception as e:
        logger.warning(f"Error generating subject tags: {str(e)}")
        # Fallback tags if extraction fails
        if not subject_tags:
            subject_tags = DEFAULT_TAGS.copy()

    # Ensure we have at least 3 tags: default tags first, then generic ones;
    # stop early if both pools are exhausted
    if len(subject_tags) < 3:
        for fallback in (*DEFAULT_TAGS, *GENERIC_TAGS):
            if len(subject_tags) >= 3:
                break
            _add_unique_tag(subject_tags, fallback)

    return subject_tags