""" Script to verify that our fixes properly prioritize text from segmented regions in the OCR output, ensuring images don't overshadow text content. """ import os import json import tempfile from pathlib import Path import logging from PIL import Image # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) def verify_fix(): """ Simulate the OCR process with segmentation to verify text prioritization """ print("Verifying segmentation and text prioritization fix...") print("-" * 80) # Create a simulated OCR result structure ocr_result = { "file_name": "test_document.jpg", "topics": ["Document"], "languages": ["English"], "ocr_contents": { "raw_text": "This is incorrect text that would be extracted from an image-focused OCR process.", "title": "Test Document" } } # Create simulated segmentation data that would be from our improved process segmentation_data = { 'text_regions_coordinates': [(10, 10, 100, 20), (10, 40, 100, 20)], 'regions_count': 2, 'segmentation_applied': True, 'combined_text': "FIFTH AVENUE AT FIFTENTH STREET, NORTH\n\nBIRMINGHAM 2, ALABAMA\n\nDear Mary:\n\nHaving received your letter, I wanted to respond promptly.", 'region_results': [ { 'text': "FIFTH AVENUE AT FIFTENTH STREET, NORTH", 'coordinates': (10, 10, 100, 20), 'order': 0 }, { 'text': "BIRMINGHAM 2, ALABAMA", 'coordinates': (10, 40, 100, 20), 'order': 1 } ] } # Create preprocessing options with segmentation data preprocessing_options = { 'document_type': 'letter', 'segmentation_data': segmentation_data } # Import the clean_ocr_result function to test from utils.image_utils import clean_ocr_result # Process the result to see how text is prioritized print("Original OCR text (before fix): ") print(f" '{ocr_result['ocr_contents']['raw_text']}'") print() # Use our improved clean_ocr_result function cleaned_result = clean_ocr_result( ocr_result, use_segmentation=True, vision_enabled=True, preprocessing_options=preprocessing_options ) # Print the results to verify text prioritization print("After applying fix (should prioritize segmented text):") if 'segmentation_text' in cleaned_result['ocr_contents']: print("✓ Segmentation text was properly added to results") print(f" Segmentation text: '{cleaned_result['ocr_contents']['segmentation_text']}'") else: print("✗ Segmentation text was NOT added to results") if cleaned_result['ocr_contents'].get('main_text') == segmentation_data['combined_text']: print("✓ Segmentation text was correctly used as the main text") else: print("✗ Segmentation text was NOT used as the main text") if 'original_raw_text' in cleaned_result['ocr_contents']: print("✓ Original raw text was preserved as a backup") else: print("✗ Original raw text was NOT preserved") if cleaned_result['ocr_contents'].get('raw_text') == segmentation_data['combined_text']: print("✓ Raw text was correctly replaced with segmentation text") else: print("✗ Raw text was NOT replaced with segmentation text") print() print("Final OCR text content:") print("-" * 30) print(cleaned_result['ocr_contents'].get('raw_text', "No text found")) print("-" * 30) print() print("Conclusion:") if (cleaned_result['ocr_contents'].get('raw_text') == segmentation_data['combined_text'] and cleaned_result['ocr_contents'].get('main_text') == segmentation_data['combined_text']): print("✅ Fix successfully prioritizes text from segmented regions!") else: print("❌ Fix did NOT correctly prioritize text from segmented regions.") if __name__ == "__main__": verify_fix()