import os
import hashlib
import tempfile
import streamlit as st
import logging
import time
from datetime import datetime
from pathlib import Path
from structured_ocr import StructuredOCR
from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
from preprocessing import apply_preprocessing_to_file
from error_handler import handle_ocr_error, check_file_size

# Configure logging
logger = logging.getLogger("ocr_processing")
logger.setLevel(logging.INFO)


@st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None):
    """
    Cached version of OCR processing to reuse results.

    The `cache_key` and `preprocessing_options_hash` arguments exist only to
    participate in Streamlit's cache key; they are not used in the body.

    Args:
        file_path: Path to the file to process
        file_type: Type of file ("pdf" or "image")
        use_vision: Whether to use the vision model
        file_size_mb: File size in MB
        cache_key: Cache key for the file
        preprocessing_options_hash: Hash of preprocessing options

    Returns:
        dict: OCR result
    """
    processor = StructuredOCR()

    # `timing` logs/reports elapsed wall time for the OCR call.
    with timing(f"OCR processing of {file_type} file"):
        result = processor.process_file(
            file_path,
            file_type=file_type,
            use_vision=use_vision,
            file_size_mb=file_size_mb
        )

    return result


def process_file(uploaded_file, use_vision=True, preprocessing_options=None, progress_reporter=None,
                 pdf_dpi=150, max_pages=3, pdf_rotation=0, custom_prompt=None, perf_mode="Quality"):
    """
    Process the uploaded file and return the OCR results.

    Args:
        uploaded_file: The uploaded file to process (Streamlit UploadedFile)
        use_vision: Whether to use the vision model
        preprocessing_options: Dictionary of preprocessing options
        progress_reporter: ProgressReporter instance for UI updates
        pdf_dpi: DPI for PDF conversion
            NOTE(review): not forwarded to the OCR call below — confirm intent.
        max_pages: Maximum number of pages to process
            NOTE(review): accepted but currently unused — confirm intent.
        pdf_rotation: PDF rotation value
        custom_prompt: Custom prompt for OCR
        perf_mode: Performance mode ("Quality" or "Speed")

    Returns:
        dict: OCR result, or an error-shaped dict on failure
    """
    if preprocessing_options is None:
        preprocessing_options = {}

    # Create a container for progress indicators if not provided.
    # Imported lazily to avoid a circular import with ui_components.
    if progress_reporter is None:
        from ui_components import ProgressReporter
        progress_reporter = ProgressReporter(st.empty()).setup()

    # Track every temp file we create so the finally-block can clean up.
    temp_file_paths = []

    try:
        # Reject files that exceed the maximum allowed size up front.
        is_valid, file_size_mb, error_message = check_file_size(uploaded_file.getvalue())
        if not is_valid:
            progress_reporter.complete(success=False)
            st.error(error_message)
            return {
                "file_name": uploaded_file.name,
                "topics": ["Document"],
                "languages": ["English"],
                "error": error_message,
                "ocr_contents": {
                    "error": error_message,
                    "partial_text": "Document could not be processed due to size limitations."
                }
            }

        progress_reporter.update(10, "Initializing OCR processor...")

        # Determine file type from extension.
        file_ext = Path(uploaded_file.name).suffix.lower()
        file_type = "pdf" if file_ext == ".pdf" else "image"

        file_bytes = uploaded_file.getvalue()

        if file_type == "pdf":
            # PDFs are written to disk unmodified and handed to the processor.
            progress_reporter.update(20, "Converting PDF to images...")
            progress_reporter.update(30, "Processing PDF with OCR...")

            # FIX: write through the NamedTemporaryFile handle instead of
            # taking .name and reopening — the original leaked the tempfile's
            # own open handle.
            with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tf:
                tf.write(file_bytes)
                temp_path = tf.name
            temp_file_paths.append(temp_path)

            cache_key = generate_cache_key(
                file_bytes,
                file_type,
                use_vision,
                preprocessing_options,
                pdf_rotation,
                custom_prompt
            )

            # Prefer the cached path; fall back to direct processing on error.
            try:
                result = process_file_cached(temp_path, file_type, use_vision, file_size_mb,
                                             cache_key, str(preprocessing_options))
                progress_reporter.update(90, "Finalizing results...")
            except Exception as e:
                logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
                progress_reporter.update(60, f"Processing error: {str(e)}. Retrying...")

                processor = StructuredOCR()

                if perf_mode == "Speed":
                    # NOTE(review): this lowered DPI is never passed to
                    # process_file below, so it currently has no effect —
                    # confirm whether StructuredOCR should receive it.
                    if pdf_dpi > 100:
                        pdf_dpi = 100  # Lower DPI for speed

                result = processor.process_file(
                    file_path=temp_path,
                    file_type="pdf",
                    use_vision=use_vision,
                    custom_prompt=custom_prompt,
                    file_size_mb=file_size_mb,
                    pdf_rotation=pdf_rotation
                )
                progress_reporter.update(90, "Finalizing results...")
        else:
            # Image files may be preprocessed (deskew, rotation, etc.) first.
            progress_reporter.update(20, "Preparing image for processing...")

            temp_path, preprocessing_applied = apply_preprocessing_to_file(
                file_bytes,
                file_ext,
                preprocessing_options,
                temp_file_paths
            )

            if preprocessing_applied:
                progress_reporter.update(30, "Applied image preprocessing...")

            # FIX: read the (possibly preprocessed) bytes via a context
            # manager — the original `open(temp_path, 'rb').read()` leaked
            # the file handle.
            with open(temp_path, 'rb') as f:
                processed_bytes = f.read()

            cache_key = generate_cache_key(
                processed_bytes,
                file_type,
                use_vision,
                preprocessing_options,
                0,  # No rotation for images (handled in preprocessing)
                custom_prompt
            )

            progress_reporter.update(50, "Processing document with OCR...")
            try:
                result = process_file_cached(temp_path, file_type, use_vision, file_size_mb,
                                             cache_key, str(preprocessing_options))
                progress_reporter.update(80, "Analyzing document structure...")
                progress_reporter.update(90, "Finalizing results...")
            except Exception as e:
                logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
                progress_reporter.update(60, f"Processing error: {str(e)}. Retrying...")

                processor = StructuredOCR()

                if perf_mode == "Speed":
                    # Any speed optimizations would be handled by the StructuredOCR class
                    pass

                result = processor.process_file(
                    file_path=temp_path,
                    file_type=file_type,
                    use_vision=use_vision,
                    custom_prompt=custom_prompt,
                    file_size_mb=file_size_mb
                )
                progress_reporter.update(90, "Finalizing results...")

        # Add timestamps, descriptive filename, and subject tags.
        result = process_result(result, uploaded_file, preprocessing_options)

        progress_reporter.complete()
        return result

    except Exception as e:
        # Convert the exception into a user-facing message and an
        # error-shaped result so the UI can still render something.
        error_message = handle_ocr_error(e, progress_reporter)
        return {
            "file_name": uploaded_file.name,
            "topics": ["Document"],
            "languages": ["English"],
            "error": error_message,
            "ocr_contents": {
                "error": f"Failed to process file: {error_message}",
                "partial_text": "Document could not be processed due to an error."
            }
        }
    finally:
        # Best-effort removal of every temp file created above.
        for temp_path in temp_file_paths:
            try:
                if os.path.exists(temp_path):
                    os.unlink(temp_path)
                    logger.info(f"Removed temporary file: {temp_path}")
            except Exception as e:
                logger.warning(f"Failed to remove temporary file {temp_path}: {str(e)}")


def process_result(result, uploaded_file, preprocessing_options=None):
    """
    Post-process an OCR result: add timestamp, processing time, a
    descriptive filename, and subject tags.

    Args:
        result: OCR result dictionary (mutated in place and returned)
        uploaded_file: The uploaded file
        preprocessing_options: Dictionary of preprocessing options

    Returns:
        dict: Processed OCR result
    """
    result['timestamp'] = format_timestamp()

    # Ensure a processing_time field exists for downstream display.
    if 'processing_time' not in result:
        result['processing_time'] = 0.0

    # Generate a descriptive filename from the result metadata.
    file_ext = Path(uploaded_file.name).suffix.lower()
    result['descriptive_file_name'] = create_descriptive_filename(
        uploaded_file.name,
        result,
        file_ext,
        preprocessing_options
    )

    # Pull raw text out of the OCR contents for tag extraction;
    # 'raw_text' is preferred, 'content' is the fallback.
    raw_text = ""
    if 'ocr_contents' in result:
        if 'raw_text' in result['ocr_contents']:
            raw_text = result['ocr_contents']['raw_text']
        elif 'content' in result['ocr_contents']:
            raw_text = result['ocr_contents']['content']

    # Only derive topics when none were produced by the OCR step.
    if 'topics' not in result or not result['topics']:
        result['topics'] = extract_subject_tags(result, raw_text, preprocessing_options)

    return result