historical-ocr / test_adaptive_segmentation.py
milwright's picture
Fix OCR processing variable scope issue by using explicit module reference for apply_preprocessing_to_file
3dd2ff2
raw
history blame
3.83 kB
#!/usr/bin/env python3
"""
Test script for adaptive content-aware segmentation.
Processes sample documents to validate the improved segmentation approach.
"""
import os
import sys
import logging
from pathlib import Path
import cv2
import numpy as np
from PIL import Image
import json
# Root logging setup: INFO and above, each record stamped with the time,
# the emitting logger's name and the severity level.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger used by everything in this script.
logger = logging.getLogger(__name__)
# Import segmentation module
from image_segmentation import segment_image_for_ocr, process_segmented_image
# Sample inputs for the segmentation run, as paths relative to the working
# directory. Entries that do not exist on disk are skipped at run time.
TEST_DOCUMENTS = [
    # Document with varied text density and uppercase sections
    "input/baldwin-15th-north.jpg",
    # Historical document
    "input/americae-retectio.jpg",
    # Handwritten document
    "input/handwritten-letter.jpg",
]
def _write_markdown_report(results, report_path):
    """Write a Markdown summary of the segmentation results.

    Args:
        results: Mapping of document file name to a dict with the keys
            ``regions_count``, ``output_files`` and ``regions``, as built by
            ``test_adaptive_segmentation``.
        report_path: Destination path of the Markdown file.

    Raises:
        ValueError: If a region does not unpack into exactly four values.
    """
    lines = [
        "# Adaptive Segmentation Test Results\n\n",
        "This report summarizes the results of testing the adaptive content-aware segmentation approach.\n\n",
    ]

    for document_name, result in results.items():
        lines.append(f"## {document_name}\n\n")
        lines.append(f"- Regions detected: {result['regions_count']}\n")
        lines.append("- Output files:\n")
        # Two-space indent so the entries render as a list nested under
        # "Output files" rather than as siblings of it.
        lines.extend(
            f"  - {file_type}: {file_path}\n"
            for file_type, file_path in result.get("output_files", {}).items()
        )
        lines.append("\n")

        # Add region analysis
        regions = result.get("regions")
        if regions:
            lines.append("### Region Analysis\n\n")
            lines.append("| Region | X | Y | Width | Height |\n")
            lines.append("|--------|---|---|-------|--------|\n")
            # Each region is unpacked as (x, y, width, height).
            for index, (x, y, w, h) in enumerate(regions, start=1):
                lines.append(f"| {index} | {x} | {y} | {w} | {h} |\n")
            lines.append("\n")

    # Explicit UTF-8 so the report does not depend on the locale encoding
    # (document names may contain non-ASCII characters).
    with open(report_path, "w", encoding="utf-8") as f:
        f.writelines(lines)


def test_adaptive_segmentation():
    """
    Run the adaptive segmentation on test documents and save the results.

    Every path in ``TEST_DOCUMENTS`` that exists on disk is passed to
    ``process_segmented_image`` together with the output directory
    ``output/adaptive_test`` (created if missing). Documents for which text
    regions are reported are collected and written to two files in that
    directory:

    - ``adaptive_segmentation_results.json``: the collected data as JSON
    - ``adaptive_segmentation_report.md``: a Markdown summary

    Missing documents and documents without detected text regions are logged
    and left out of both files.

    The function deliberately takes no parameters so it can be invoked
    directly from the ``__main__`` guard.
    """
    # Create output directory
    output_dir = Path("output") / "adaptive_test"
    output_dir.mkdir(parents=True, exist_ok=True)

    results = {}

    # Process each test document
    for document_path in TEST_DOCUMENTS:
        document_file = Path(document_path)
        if not document_file.exists():
            logger.warning("Test document not found: %s", document_path)
            continue

        logger.info("Processing test document: %s", document_file.name)

        # NOTE(review): the return value is only read via ``.get``; it is
        # presumably a dict with 'text_regions_coordinates' and
        # 'output_files' keys - confirm against image_segmentation.
        segmentation_results = process_segmented_image(document_file, output_dir)

        regions = segmentation_results.get("text_regions_coordinates")
        if not regions:
            # Make the omission visible instead of dropping the document
            # from the reports without a trace.
            logger.warning(
                "No text regions detected in %s; omitted from report",
                document_file.name,
            )
            continue

        output_files = segmentation_results.get("output_files", {})

        # Print analysis
        logger.info("Document: %s", document_file.name)
        logger.info("Found %s text regions", len(regions))
        logger.info("Output files: %s", output_files)

        # Store results
        results[document_file.name] = {
            "regions_count": len(regions),
            "output_files": output_files,
            "regions": regions,
        }

    # Save the machine-readable results
    json_path = output_dir / "adaptive_segmentation_results.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    # Save the human-readable summary report
    report_path = output_dir / "adaptive_segmentation_report.md"
    _write_markdown_report(results, report_path)

    logger.info("Test completed. Results saved to %s", output_dir)
    logger.info("Summary report: %s", report_path)
# Script entry point: run the segmentation check only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    test_adaptive_segmentation()