Spaces:

milwright
/

historical-ocr

Running

App Files Community

milwright committed on Jun 12

Commit

cfec41e

verified ·

1 Parent(s): 32b817f

Delete testing

Browse files

Files changed (10) hide show

testing/magician_test/branch_comparison.txt +0 -20
testing/magician_test/processed_magician.jpg +0 -3
testing/magician_test/test_report.txt +0 -16
testing/newspaper_test/newspaper_comparison.jpg +0 -3
testing/newspaper_test/newspaper_test_report.txt +0 -18
testing/newspaper_test/processed_newspaper.jpg +0 -3
testing/output/processed_magician.jpg +0 -3
testing/output/test_report.txt +0 -16
testing/test_filename_format.py +0 -93
testing/test_json_bleed.py +0 -46

testing/magician_test/branch_comparison.txt DELETED Viewed

@@ -1,20 +0,0 @@
-Comparison of ocr_utils.py between main and reconcile-improvements branches
-==================================================================
-Key improvements in reconcile-improvements branch:
-1. Enhanced illustration/etching detection:
-   - Added detection based on filename keywords (e.g., 'magician', 'illustration')
-   - Implemented image-based detection using edge density analysis
-2. Specialized processing for illustrations:
-   - Gentler scaling to preserve fine details
-   - Mild contrast enhancement (1.3 vs. higher values for other documents)
-   - Specialized sharpening for fine lines in etchings
-   - Higher quality settings (95 vs. 85) to prevent detail loss
-3. Performance optimizations:
-   - More efficient processing paths for different image types
-   - Better memory management for large images
-Test results for magician-or-bottle-cungerer.jpg demonstrate these improvements.

testing/magician_test/processed_magician.jpg DELETED Viewed

Git LFS Details

SHA256: 8824abe6e81e6b7847eca83e39fda77c3b6937d292f3647078ba4af2531d65ff
Pointer size: 132 Bytes
Size of remote file: 2.33 MB

testing/magician_test/test_report.txt DELETED Viewed

@@ -1,16 +0,0 @@
-Test Report: Magician Image Processing
-=====================================
-Original image: input/magician-or-bottle-cungerer.jpg
-Original size: 2500x2116
-Processed size: 2500x2116
-Processing time: 0.58 seconds
-Size reduction: 0.00%
-Illustration Detection:
-- Filename contains 'magician': True
-Visual Inspection Notes:
-- Check processed_magician.jpg for preservation of fine details
-- Verify that etching lines are clear and not over-processed
-- Confirm that contrast enhancement is appropriate for this illustration

testing/newspaper_test/newspaper_comparison.jpg DELETED Viewed

Git LFS Details

SHA256: 1a48abfd88f516f704f574b8d3d372c07d2c71a82e5743eae205aece7d77c2de
Pointer size: 132 Bytes
Size of remote file: 3.58 MB

testing/newspaper_test/newspaper_test_report.txt DELETED Viewed

@@ -1,18 +0,0 @@
-Newspaper Detection Test Report
-==============================
-Original image: input/magician-or-bottle-cungerer.jpg
-Original size: 2500x2116
-Processed size: 2000x1692
-Processing time: 0.71 seconds
-Aspect ratio: 1.18
-Meets newspaper criteria by dimensions: False
-Size reduction: 36.03%
-Notes on Newspaper Processing:
-- Newspaper format should be detected based on dimensions and aspect ratio
-- Specialized processing should be applied for newspaper text extraction
-- Check if the processed image shows enhanced text clarity in columns
-- Verify that the column structure is preserved for better OCR results

testing/newspaper_test/processed_newspaper.jpg DELETED Viewed

Git LFS Details

SHA256: c1a856a643e381b7312ca16931ca33a3b670dbf456357f8a7c5e91fd92ce7b5f
Pointer size: 132 Bytes
Size of remote file: 1.6 MB

testing/output/processed_magician.jpg DELETED Viewed

Git LFS Details

SHA256: 8824abe6e81e6b7847eca83e39fda77c3b6937d292f3647078ba4af2531d65ff
Pointer size: 132 Bytes
Size of remote file: 2.33 MB

testing/output/test_report.txt DELETED Viewed

@@ -1,16 +0,0 @@
-Test Report: Magician Image Processing
-=====================================
-Original image: input/magician-or-bottle-cungerer.jpg
-Original size: 2500x2116
-Processed size: 2500x2116
-Processing time: 0.58 seconds
-Size reduction: 0.00%
-Illustration Detection:
-- Filename contains 'magician': True
-Visual Inspection Notes:
-- Check processed_magician.jpg for preservation of fine details
-- Verify that etching lines are clear and not over-processed
-- Confirm that contrast enhancement is appropriate for this illustration

testing/test_filename_format.py DELETED Viewed

@@ -1,93 +0,0 @@
-"""Test the new filename formatting"""
-import os
-import sys
-import datetime
-import inspect
-# Add the project root to the path so we can import modules
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-# Import the main utils.py file directly
-import utils as root_utils
-print(f"Imported utils from: {root_utils.__file__}")
-print("Current create_descriptive_filename implementation:")
-print(inspect.getsource(root_utils.create_descriptive_filename))
-def main():
-    """Test the filename formatting"""
-    # Sample inputs
-    sample_files = [
-        "handwritten-letter.jpg",
-        "magician-or-bottle-cungerer.jpg",
-        "baldwin_15th_north.jpg",
-        "harpers.pdf",
-        "recipe.jpg"
-    ]
-    # Sample OCR results for testing
-    sample_results = [
-        {
-            "detected_document_type": "handwritten",
-            "topics": ["Letter", "Handwritten", "19th Century", "Personal Correspondence"]
-        },
-        {
-            "topics": ["Newspaper", "Print", "19th Century", "Illustration", "Advertisement"]
-        },
-        {
-            "detected_document_type": "letter",
-            "topics": ["Correspondence", "Early Modern", "English Language"]
-        },
-        {
-            "detected_document_type": "magazine",
-            "topics": ["Publication", "Late 19th Century", "Magazine", "Historical"]
-        },
-        {
-            "detected_document_type": "recipe",
-            "topics": ["Food", "Culinary", "Historical", "Instruction"]
-        }
-    ]
-    print("\nIMPROVED FILENAME FORMATTING TEST")
-    print("=" * 50)
-    # Format current date manually
-    current_date = datetime.datetime.now().strftime("%b %d, %Y")
-    print(f"Current date for filenames: {current_date}")
-    print("\nBEFORE vs AFTER Examples:\n")
-    for i, (original_file, result) in enumerate(zip(sample_files, sample_results)):
-        # Get file extension from original file
-        file_ext = os.path.splitext(original_file)[1]
-        # Generate the old style filename manually
-        original_name = os.path.splitext(original_file)[0]
-        doc_type_tag = ""
-        if 'detected_document_type' in result:
-            doc_type = result['detected_document_type'].lower()
-            doc_type_tag = f"_{doc_type.replace(' ', '_')}"
-        elif 'topics' in result and result['topics']:
-            doc_type_tag = f"_{result['topics'][0].lower().replace(' ', '_')}"
-        period_tag = ""
-        if 'topics' in result and result['topics']:
-            for tag in result['topics']:
-                if "century" in tag.lower() or "pre-" in tag.lower() or "era" in tag.lower():
-                    period_tag = f"_{tag.lower().replace(' ', '_')}"
-                    break
-        old_filename = f"{original_name}{doc_type_tag}{period_tag}{file_ext}"
-        # Generate the new descriptive filename with our improved formatter
-        new_filename = root_utils.create_descriptive_filename(original_file, result, file_ext)
-        print(f"Example {i+1}:")
-        print(f"  Original: {original_file}")
-        print(f"  Old Format: {old_filename}")
-        print(f"  New Format: {new_filename}")
-        print()
-if __name__ == "__main__":
-    main()

testing/test_json_bleed.py DELETED Viewed

@@ -1,46 +0,0 @@
-"""
-Test case to verify the fix for JSON bleed-through in historical text.
-"""
-import sys
-import os
-from pathlib import Path
-# Add parent directory to path
-sys.path.append(str(Path(__file__).parent.parent))
-from utils.content_utils import format_structured_data
-from utils.text_utils import clean_raw_text, format_markdown_text
-# Sample text with JSON-like content (historical text with curly braces)
-SAMPLE_TEXT = """# ENGLISH Credulity; or Ye're all Bottled.
-O magnus pofldac Inimicis Rifus! Hor. Sat. WITH Grief, Refentment, and averted Eyes, Britannia droops to fee her Sons, (once Wile So fam'd for Arms, for Conduct fo renown'd With ev'ry Virtue ev'ry Glory crown'd) Now fink ignoble, and to nothing fall; Obedient marching forth at Folly's Call.
-Text containing curly braces like these: { and } should not be parsed as JSON.
-Even this text with a JSON-like pattern {"key": "value"} should be preserved as-is.
-"""
-def test_format_structured_data():
-    """Test that format_structured_data preserves text content"""
-    result = format_structured_data(SAMPLE_TEXT)
-    # Verify the text is returned as-is without attempting to parse JSON-like structures
-    assert result == SAMPLE_TEXT
-    print("✓ format_structured_data correctly preserves text content")
-    # Make sure the output doesn't have any JSON code blocks
-    assert "```json" not in result
-    print("✓ format_structured_data does not create JSON code blocks")
-    return True
-if __name__ == "__main__":
-    # Run the test
-    print("Running JSON bleed-through fix tests...\n")
-    success = test_format_structured_data()
-    if success:
-        print("\nAll tests passed! The JSON bleed-through issue is fixed.")
-    else:
-        print("\nSome tests failed.")