historical-ocr / verify_segmentation_fix.py
"""
Script to verify that our fixes properly prioritize text from segmented regions
in the OCR output, ensuring images don't overshadow text content.
"""
import os
import json
import tempfile
from pathlib import Path
import logging
from PIL import Image
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def verify_fix():
    """
    Simulate the OCR process with segmentation to verify text prioritization
    """
    print("Verifying segmentation and text prioritization fix...")
    print("-" * 80)

    # Create a simulated OCR result structure
    ocr_result = {
        "file_name": "test_document.jpg",
        "topics": ["Document"],
        "languages": ["English"],
        "ocr_contents": {
            "raw_text": "This is incorrect text that would be extracted from an image-focused OCR process.",
            "title": "Test Document"
        }
    }
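    # NOTE: 'raw_text' above stands in for the output of an image-focused OCR pass.
    # The checks further down assume the fix replaces it with text recovered from the
    # segmented regions while keeping a copy under 'original_raw_text'.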
    # Create simulated segmentation data that would come from our improved process
    segmentation_data = {
        'text_regions_coordinates': [(10, 10, 100, 20), (10, 40, 100, 20)],
        'regions_count': 2,
        'segmentation_applied': True,
        'combined_text': "FIFTH AVENUE AT FIFTENTH STREET, NORTH\n\nBIRMINGHAM 2, ALABAMA\n\nDear Mary:\n\nHaving received your letter, I wanted to respond promptly.",
        'region_results': [
            {
                'text': "FIFTH AVENUE AT FIFTENTH STREET, NORTH",
                'coordinates': (10, 10, 100, 20),
                'order': 0
            },
            {
                'text': "BIRMINGHAM 2, ALABAMA",
                'coordinates': (10, 40, 100, 20),
                'order': 1
            }
        ]
    }
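    # 'combined_text' carries the full text recovered from the segmented regions; the
    # assertions printed below expect clean_ocr_result to promote it to both
    # 'main_text' and 'raw_text' in the cleaned result.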
    # Create preprocessing options with segmentation data
    preprocessing_options = {
        'document_type': 'letter',
        'segmentation_data': segmentation_data
    }

    # Import the clean_ocr_result function to test
    from utils.image_utils import clean_ocr_result
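    # Expected behaviour, inferred from the checks below rather than a formal contract:
    # clean_ocr_result() should return a result whose 'ocr_contents' includes
    # 'segmentation_text', sets 'main_text' and 'raw_text' to the combined segmentation
    # text, and preserves the original raw text under 'original_raw_text'.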
    # Process the result to see how text is prioritized
    print("Original OCR text (before fix): ")
    print(f" '{ocr_result['ocr_contents']['raw_text']}'")
    print()

    # Use our improved clean_ocr_result function
    cleaned_result = clean_ocr_result(
        ocr_result,
        use_segmentation=True,
        vision_enabled=True,
        preprocessing_options=preprocessing_options
    )

    # Print the results to verify text prioritization
    print("After applying fix (should prioritize segmented text):")
    if 'segmentation_text' in cleaned_result['ocr_contents']:
        print("✓ Segmentation text was properly added to results")
        print(f" Segmentation text: '{cleaned_result['ocr_contents']['segmentation_text']}'")
    else:
        print("✗ Segmentation text was NOT added to results")

    if cleaned_result['ocr_contents'].get('main_text') == segmentation_data['combined_text']:
        print("✓ Segmentation text was correctly used as the main text")
    else:
        print("✗ Segmentation text was NOT used as the main text")

    if 'original_raw_text' in cleaned_result['ocr_contents']:
        print("✓ Original raw text was preserved as a backup")
    else:
        print("✗ Original raw text was NOT preserved")

    if cleaned_result['ocr_contents'].get('raw_text') == segmentation_data['combined_text']:
        print("✓ Raw text was correctly replaced with segmentation text")
    else:
        print("✗ Raw text was NOT replaced with segmentation text")

    print()
    print("Final OCR text content:")
    print("-" * 30)
    print(cleaned_result['ocr_contents'].get('raw_text', "No text found"))
    print("-" * 30)
    print()

    print("Conclusion:")
    if (cleaned_result['ocr_contents'].get('raw_text') == segmentation_data['combined_text'] and
            cleaned_result['ocr_contents'].get('main_text') == segmentation_data['combined_text']):
        print("✅ Fix successfully prioritizes text from segmented regions!")
    else:
        print("❌ Fix did NOT correctly prioritize text from segmented regions.")

if __name__ == "__main__":
    verify_fix()