historical-ocr / verify_segmentation_fix.py
"""
Script to verify that our fixes properly prioritize text from segmented regions
in the OCR output, ensuring images don't overshadow text content.
"""
import os
import json
import tempfile
from pathlib import Path
import logging
from PIL import Image
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def verify_fix():
    """
    Simulate the OCR process with segmentation to verify text prioritization
    """
    print("Verifying segmentation and text prioritization fix...")
    print("-" * 80)

    # Create a simulated OCR result structure
    ocr_result = {
        "file_name": "test_document.jpg",
        "topics": ["Document"],
        "languages": ["English"],
        "ocr_contents": {
            "raw_text": "This is incorrect text that would be extracted from an image-focused OCR process.",
            "title": "Test Document"
        }
    }
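    # NOTE: 'raw_text' above stands in for the output of an image-focused OCR pass.
    # The checks further down assume the fix replaces it with text recovered from the
    # segmented regions while keeping a copy under 'original_raw_text'.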
    # Create simulated segmentation data that would come from our improved process
    segmentation_data = {
        'text_regions_coordinates': [(10, 10, 100, 20), (10, 40, 100, 20)],
        'regions_count': 2,
        'segmentation_applied': True,
        'combined_text': "FIFTH AVENUE AT FIFTENTH STREET, NORTH\n\nBIRMINGHAM 2, ALABAMA\n\nDear Mary:\n\nHaving received your letter, I wanted to respond promptly.",
        'region_results': [
            {
                'text': "FIFTH AVENUE AT FIFTENTH STREET, NORTH",
                'coordinates': (10, 10, 100, 20),
                'order': 0
            },
            {
                'text': "BIRMINGHAM 2, ALABAMA",
                'coordinates': (10, 40, 100, 20),
                'order': 1
            }
        ]
    }
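    # 'combined_text' carries the full text recovered from the segmented regions; the
    # assertions printed below expect clean_ocr_result to promote it to both
    # 'main_text' and 'raw_text' in the cleaned result.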
    # Create preprocessing options with segmentation data
    preprocessing_options = {
        'document_type': 'letter',
        'segmentation_data': segmentation_data
    }

    # Import the clean_ocr_result function to test
    from utils.image_utils import clean_ocr_result
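    # Expected behaviour, inferred from the checks below rather than a formal contract:
    # clean_ocr_result() should return a result whose 'ocr_contents' includes
    # 'segmentation_text', sets 'main_text' and 'raw_text' to the combined segmentation
    # text, and preserves the original raw text under 'original_raw_text'.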
    # Process the result to see how text is prioritized
    print("Original OCR text (before fix): ")
    print(f" '{ocr_result['ocr_contents']['raw_text']}'")
    print()

    # Use our improved clean_ocr_result function
    cleaned_result = clean_ocr_result(
        ocr_result,
        use_segmentation=True,
        vision_enabled=True,
        preprocessing_options=preprocessing_options
    )

    # Print the results to verify text prioritization
    print("After applying fix (should prioritize segmented text):")
    if 'segmentation_text' in cleaned_result['ocr_contents']:
        print("✓ Segmentation text was properly added to results")
        print(f" Segmentation text: '{cleaned_result['ocr_contents']['segmentation_text']}'")
    else:
        print("✗ Segmentation text was NOT added to results")

    if cleaned_result['ocr_contents'].get('main_text') == segmentation_data['combined_text']:
        print("✓ Segmentation text was correctly used as the main text")
    else:
        print("✗ Segmentation text was NOT used as the main text")

    if 'original_raw_text' in cleaned_result['ocr_contents']:
        print("✓ Original raw text was preserved as a backup")
    else:
        print("✗ Original raw text was NOT preserved")

    if cleaned_result['ocr_contents'].get('raw_text') == segmentation_data['combined_text']:
        print("✓ Raw text was correctly replaced with segmentation text")
    else:
        print("✗ Raw text was NOT replaced with segmentation text")

    print()
    print("Final OCR text content:")
    print("-" * 30)
    print(cleaned_result['ocr_contents'].get('raw_text', "No text found"))
    print("-" * 30)
    print()

    print("Conclusion:")
    if (cleaned_result['ocr_contents'].get('raw_text') == segmentation_data['combined_text'] and
            cleaned_result['ocr_contents'].get('main_text') == segmentation_data['combined_text']):
        print("✅ Fix successfully prioritizes text from segmented regions!")
    else:
        print("❌ Fix did NOT correctly prioritize text from segmented regions.")

if __name__ == "__main__":
    verify_fix()