Spaces:

milwright/historical-ocr

Running

App Files Community

milwright committed on May 3

Commit

51f51ba

verified ·

1 Parent(s): 8b46e90

Delete test_segmentation_fix.py

Browse files

Files changed (1) hide show

test_segmentation_fix.py +0 -100

test_segmentation_fix.py DELETED Viewed

@@ -1,100 +0,0 @@
-"""
-Test script to verify the segmentation and OCR improvements.
-This script will process an image using the updated segmentation algorithm
-and show how text recognition is prioritized over images.
-"""
-import os
-import json
-import tempfile
-from pathlib import Path
-from PIL import Image
-# Import the key components we modified
-from image_segmentation import segment_image_for_ocr
-from ocr_processing import process_file, process_result
-from utils.image_utils import clean_ocr_result
-import logging
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-def run_test(image_path):
-    """Run a test on the specified image to verify our fixes"""
-    print(f"Testing image segmentation and OCR prioritization on: {image_path}")
-    print("-" * 80)
-    # Make sure the image exists
-    if not os.path.exists(image_path):
-        print(f"Error: Image not found at {image_path}")
-        return
-    # 1. First run image segmentation directly
-    try:
-        print("Step 1: Running image segmentation...")
-        segmentation_results = segment_image_for_ocr(
-            image_path,
-            vision_enabled=True,
-            preserve_content=True
-        )
-        # Print segmentation info
-        text_regions_count = len(segmentation_results.get('text_regions_coordinates', []))
-        print(f"Detected {text_regions_count} text regions in the image")
-        # Save output images for inspection
-        output_dir = Path("output/segmentation_test")
-        output_dir.mkdir(parents=True, exist_ok=True)
-        if segmentation_results['text_regions'] is not None:
-            output_path = output_dir / f"text_regions_improved.jpg"
-            segmentation_results['text_regions'].save(output_path)
-            print(f"Saved text regions visualization to: {output_path}")
-        if segmentation_results['image_regions'] is not None:
-            output_path = output_dir / f"image_regions_improved.jpg"
-            segmentation_results['image_regions'].save(output_path)
-            print(f"Saved image regions visualization to: {output_path}")
-        if segmentation_results['combined_result'] is not None:
-            output_path = output_dir / f"combined_result_improved.jpg"
-            segmentation_results['combined_result'].save(output_path)
-            print(f"Saved combined result to: {output_path}")
-        # Extract individual text regions if available
-        if 'region_images' in segmentation_results and segmentation_results['region_images']:
-            region_dir = output_dir / "text_regions"
-            region_dir.mkdir(exist_ok=True)
-            for idx, region_info in enumerate(segmentation_results['region_images']):
-                region_path = region_dir / f"region_{idx+1}.jpg"
-                region_info['pil_image'].save(region_path)
-            print(f"Saved {len(segmentation_results['region_images'])} individual text regions to {region_dir}")
-    except Exception as e:
-        print(f"Error during segmentation: {str(e)}")
-    print("-" * 80)
-    print("Test complete. Check the output directory for results.")
-    print("The text regions should now properly include all text content in the document.")
-    print("Image regions should be minimal and not contain text.")
-if __name__ == "__main__":
-    # Test with an image that has mixed text and image content
-    # You can change this to any image path you want to test
-    test_image = "input/baldwin-letter.jpg"
-    if not os.path.exists(test_image):
-        print(f"Test image not found at {test_image}, looking for alternatives...")
-        # Try to find an alternative test image
-        for potential_img in ["input/harpers.pdf", "input/magician-or-bottle-cungerer.jpg", "input/magellan-travels.jpg"]:
-            if os.path.exists(potential_img):
-                test_image = potential_img
-                print(f"Using alternative test image: {test_image}")
-                break
-    if os.path.exists(test_image):
-        run_test(test_image)
-    else:
-        print("No suitable test images found. Please place an image in the input directory.")