""" | |
Script to verify that our fixes properly prioritize text from segmented regions | |
in the OCR output, ensuring images don't overshadow text content. | |
""" | |
import os | |
import json | |
import tempfile | |
from pathlib import Path | |
import logging | |
from PIL import Image | |
# Configure logging | |
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |

def verify_fix():
    """
    Simulate the OCR process with segmentation to verify text prioritization.
    """
    print("Verifying segmentation and text prioritization fix...")
    print("-" * 80)

    # Create a simulated OCR result structure
    ocr_result = {
        "file_name": "test_document.jpg",
        "topics": ["Document"],
        "languages": ["English"],
        "ocr_contents": {
            "raw_text": "This is incorrect text that would be extracted from an image-focused OCR process.",
            "title": "Test Document"
        }
    }

    # Create simulated segmentation data that would come from our improved process
    segmentation_data = {
        'text_regions_coordinates': [(10, 10, 100, 20), (10, 40, 100, 20)],
        'regions_count': 2,
        'segmentation_applied': True,
        'combined_text': "FIFTH AVENUE AT FIFTENTH STREET, NORTH\n\nBIRMINGHAM 2, ALABAMA\n\nDear Mary:\n\nHaving received your letter, I wanted to respond promptly.",
        'region_results': [
            {
                'text': "FIFTH AVENUE AT FIFTENTH STREET, NORTH",
                'coordinates': (10, 10, 100, 20),
                'order': 0
            },
            {
                'text': "BIRMINGHAM 2, ALABAMA",
                'coordinates': (10, 40, 100, 20),
                'order': 1
            }
        ]
    }

    # Create preprocessing options with segmentation data
    preprocessing_options = {
        'document_type': 'letter',
        'segmentation_data': segmentation_data
    }

    # Import the clean_ocr_result function to test
    from utils.image_utils import clean_ocr_result

    # Process the result to see how text is prioritized
    print("Original OCR text (before fix):")
    print(f"  '{ocr_result['ocr_contents']['raw_text']}'")
    print()

    # Use our improved clean_ocr_result function
    cleaned_result = clean_ocr_result(
        ocr_result,
        use_segmentation=True,
        vision_enabled=True,
        preprocessing_options=preprocessing_options
    )
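
    # Expected behavior under test: clean_ocr_result should add a
    # 'segmentation_text' entry, promote the combined segmented text to both
    # 'main_text' and 'raw_text', and keep the original OCR output under
    # 'original_raw_text'. Each check below verifies one of these expectations.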

    # Print the results to verify text prioritization
    print("After applying fix (should prioritize segmented text):")

    if 'segmentation_text' in cleaned_result['ocr_contents']:
        print("✓ Segmentation text was properly added to results")
        print(f"  Segmentation text: '{cleaned_result['ocr_contents']['segmentation_text']}'")
    else:
        print("✗ Segmentation text was NOT added to results")

    if cleaned_result['ocr_contents'].get('main_text') == segmentation_data['combined_text']:
        print("✓ Segmentation text was correctly used as the main text")
    else:
        print("✗ Segmentation text was NOT used as the main text")

    if 'original_raw_text' in cleaned_result['ocr_contents']:
        print("✓ Original raw text was preserved as a backup")
    else:
        print("✗ Original raw text was NOT preserved")

    if cleaned_result['ocr_contents'].get('raw_text') == segmentation_data['combined_text']:
        print("✓ Raw text was correctly replaced with segmentation text")
    else:
        print("✗ Raw text was NOT replaced with segmentation text")

    print()
    print("Final OCR text content:")
    print("-" * 30)
    print(cleaned_result['ocr_contents'].get('raw_text', "No text found"))
    print("-" * 30)
    print()

    print("Conclusion:")
    if (cleaned_result['ocr_contents'].get('raw_text') == segmentation_data['combined_text'] and
            cleaned_result['ocr_contents'].get('main_text') == segmentation_data['combined_text']):
        print("✅ Fix successfully prioritizes text from segmented regions!")
    else:
        print("❌ Fix did NOT correctly prioritize text from segmented regions.")


if __name__ == "__main__":
    verify_fix()
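
# Note: run this script from the project root so that the `utils` package is
# importable, e.g. `python verify_fix.py` (the file name here is illustrative
# and may differ in the repository).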