Spaces:
Running
Running
#!/usr/bin/env python3 | |
""" | |
Test script for adaptive content-aware segmentation. | |
Processes sample documents to validate the improved segmentation approach. | |
""" | |
import os | |
import sys | |
import logging | |
from pathlib import Path | |
import cv2 | |
import numpy as np | |
from PIL import Image | |
import json | |
# Configure root logging at INFO level with a timestamped record format,
# then create the module-level logger used by the rest of this script.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
# Import segmentation module | |
from image_segmentation import segment_image_for_ocr, process_segmented_image | |
# Sample input images exercised by the segmentation test. The paths are
# relative, so they are resolved against the current working directory.
TEST_DOCUMENTS = [
    # Document with varied text density and uppercase sections
    "input/baldwin-15th-north.jpg",
    # Historical document
    "input/americae-retectio.jpg",
    # Handwritten document
    "input/handwritten-letter.jpg",
]
def _json_default(value):
    """
    Fallback encoder for values the ``json`` module cannot serialize natively.

    ``json.dump`` only calls this for objects it does not already know how to
    encode, so plain dicts, lists, strings and Python numbers are unaffected.

    Args:
        value: The object ``json`` failed to encode.

    Returns:
        A JSON-compatible equivalent: ``str`` for ``Path`` objects, a Python
        scalar for NumPy scalars, and a nested list for NumPy arrays.

    Raises:
        TypeError: If ``value`` is of any other type, mirroring the error the
            ``json`` module raises when no fallback is supplied.
    """
    if isinstance(value, Path):
        return str(value)
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def _write_markdown_report(results, report_path):
    """
    Write the per-document summary in ``results`` to ``report_path`` as Markdown.

    Args:
        results: Mapping of document name to a dict with the keys
            ``regions_count``, ``output_files`` and ``regions`` as built by
            ``test_adaptive_segmentation``.
        report_path: Destination path of the Markdown file.

    Raises:
        ValueError: If a region cannot be unpacked into exactly four values.
    """
    # Explicit UTF-8 so the report does not depend on the platform's locale
    # encoding (file names and paths may contain non-ASCII characters).
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("# Adaptive Segmentation Test Results\n\n")
        f.write("This report summarizes the results of testing the adaptive content-aware segmentation approach.\n\n")
        for document_name, result in results.items():
            f.write(f"## {document_name}\n\n")
            f.write(f"- Regions detected: {result['regions_count']}\n")
            f.write("- Output files:\n")
            for file_type, file_path in result.get('output_files', {}).items():
                f.write(f" - {file_type}: {file_path}\n")
            f.write("\n")
            # Add region analysis
            if result.get('regions'):
                f.write("### Region Analysis\n\n")
                f.write("| Region | X | Y | Width | Height |\n")
                f.write("|--------|---|---|-------|--------|\n")
                # Each region is unpacked as (x, y, width, height);
                # table rows are numbered from 1.
                for i, (x, y, w, h) in enumerate(result['regions'], start=1):
                    f.write(f"| {i} | {x} | {y} | {w} | {h} |\n")
                f.write("\n")


def test_adaptive_segmentation():
    """
    Run the adaptive segmentation on test documents and write summary reports.

    Every path in ``TEST_DOCUMENTS`` that exists on disk is passed to
    ``process_segmented_image`` together with the output directory
    ``output/adaptive_test`` (created if missing). Documents for which the
    returned mapping has a non-empty ``text_regions_coordinates`` entry are
    recorded; missing documents and documents without regions are logged as
    warnings and skipped.

    Two files are written to the output directory:

    * ``adaptive_segmentation_results.json`` - the collected results as JSON.
    * ``adaptive_segmentation_report.md`` - a Markdown summary with a table of
      region coordinates per document.

    Returns:
        None. Results are written to disk and progress is logged.
    """
    # Create output directory
    output_dir = Path("output") / "adaptive_test"
    output_dir.mkdir(parents=True, exist_ok=True)

    results = {}

    # Process each test document
    for document_path in TEST_DOCUMENTS:
        document_file = Path(document_path)
        if not document_file.exists():
            logger.warning(f"Test document not found: {document_path}")
            continue

        logger.info(f"Processing test document: {document_file.name}")

        # Process the document
        segmentation_results = process_segmented_image(document_file, output_dir)

        regions = segmentation_results.get('text_regions_coordinates')
        if not regions:
            # Previously such documents vanished from the report without any
            # trace; make the omission visible in the log.
            logger.warning(f"No text regions found for document: {document_file.name}")
            continue

        output_files = segmentation_results.get('output_files', {})

        # Print analysis
        logger.info(f"Document: {document_file.name}")
        logger.info(f"Found {len(regions)} text regions")
        logger.info(f"Output files: {output_files}")

        # Store results
        results[document_file.name] = {
            "regions_count": len(regions),
            "output_files": output_files,
            "regions": regions,
        }

    # Save the machine-readable results. The fallback encoder keeps the dump
    # from failing if the segmentation results contain Path objects or NumPy
    # values instead of plain Python types.
    results_path = output_dir / "adaptive_segmentation_results.json"
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=_json_default)

    # Create a summary report
    report_path = output_dir / "adaptive_segmentation_report.md"
    _write_markdown_report(results, report_path)

    logger.info(f"Test completed. Results saved to {output_dir}")
    logger.info(f"Summary report: {output_dir / 'adaptive_segmentation_report.md'}")
# Script entry point: run the segmentation test only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    test_adaptive_segmentation()