Spaces:
Running
Running
Delete testing
Browse files
- testing/magician_test/branch_comparison.txt +0 -20
- testing/magician_test/processed_magician.jpg +0 -3
- testing/magician_test/test_report.txt +0 -16
- testing/newspaper_test/newspaper_comparison.jpg +0 -3
- testing/newspaper_test/newspaper_test_report.txt +0 -18
- testing/newspaper_test/processed_newspaper.jpg +0 -3
- testing/output/processed_magician.jpg +0 -3
- testing/output/test_report.txt +0 -16
- testing/test_filename_format.py +0 -93
- testing/test_json_bleed.py +0 -46
testing/magician_test/branch_comparison.txt
DELETED
@@ -1,20 +0,0 @@
|
|
1 |
-
Comparison of ocr_utils.py between main and reconcile-improvements branches
|
2 |
-
==================================================================
|
3 |
-
|
4 |
-
Key improvements in reconcile-improvements branch:
|
5 |
-
|
6 |
-
1. Enhanced illustration/etching detection:
|
7 |
-
- Added detection based on filename keywords (e.g., 'magician', 'illustration')
|
8 |
-
- Implemented image-based detection using edge density analysis
|
9 |
-
|
10 |
-
2. Specialized processing for illustrations:
|
11 |
-
- Gentler scaling to preserve fine details
|
12 |
-
- Mild contrast enhancement (1.3 vs. higher values for other documents)
|
13 |
-
- Specialized sharpening for fine lines in etchings
|
14 |
-
- Higher quality settings (95 vs. 85) to prevent detail loss
|
15 |
-
|
16 |
-
3. Performance optimizations:
|
17 |
-
- More efficient processing paths for different image types
|
18 |
-
- Better memory management for large images
|
19 |
-
|
20 |
-
Test results for magician-or-bottle-cungerer.jpg demonstrate these improvements.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
testing/magician_test/processed_magician.jpg
DELETED
Git LFS Details
|
testing/magician_test/test_report.txt
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
Test Report: Magician Image Processing
|
2 |
-
=====================================
|
3 |
-
|
4 |
-
Original image: input/magician-or-bottle-cungerer.jpg
|
5 |
-
Original size: 2500x2116
|
6 |
-
Processed size: 2500x2116
|
7 |
-
Processing time: 0.58 seconds
|
8 |
-
Size reduction: 0.00%
|
9 |
-
|
10 |
-
Illustration Detection:
|
11 |
-
- Filename contains 'magician': True
|
12 |
-
|
13 |
-
Visual Inspection Notes:
|
14 |
-
- Check processed_magician.jpg for preservation of fine details
|
15 |
-
- Verify that etching lines are clear and not over-processed
|
16 |
-
- Confirm that contrast enhancement is appropriate for this illustration
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
testing/newspaper_test/newspaper_comparison.jpg
DELETED
Git LFS Details
|
testing/newspaper_test/newspaper_test_report.txt
DELETED
@@ -1,18 +0,0 @@
|
|
1 |
-
Newspaper Detection Test Report
|
2 |
-
==============================
|
3 |
-
|
4 |
-
Original image: input/magician-or-bottle-cungerer.jpg
|
5 |
-
Original size: 2500x2116
|
6 |
-
Processed size: 2000x1692
|
7 |
-
Processing time: 0.71 seconds
|
8 |
-
|
9 |
-
Aspect ratio: 1.18
|
10 |
-
Meets newspaper criteria by dimensions: False
|
11 |
-
|
12 |
-
Size reduction: 36.03%
|
13 |
-
|
14 |
-
Notes on Newspaper Processing:
|
15 |
-
- Newspaper format should be detected based on dimensions and aspect ratio
|
16 |
-
- Specialized processing should be applied for newspaper text extraction
|
17 |
-
- Check if the processed image shows enhanced text clarity in columns
|
18 |
-
- Verify that the column structure is preserved for better OCR results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
testing/newspaper_test/processed_newspaper.jpg
DELETED
Git LFS Details
|
testing/output/processed_magician.jpg
DELETED
Git LFS Details
|
testing/output/test_report.txt
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
Test Report: Magician Image Processing
|
2 |
-
=====================================
|
3 |
-
|
4 |
-
Original image: input/magician-or-bottle-cungerer.jpg
|
5 |
-
Original size: 2500x2116
|
6 |
-
Processed size: 2500x2116
|
7 |
-
Processing time: 0.58 seconds
|
8 |
-
Size reduction: 0.00%
|
9 |
-
|
10 |
-
Illustration Detection:
|
11 |
-
- Filename contains 'magician': True
|
12 |
-
|
13 |
-
Visual Inspection Notes:
|
14 |
-
- Check processed_magician.jpg for preservation of fine details
|
15 |
-
- Verify that etching lines are clear and not over-processed
|
16 |
-
- Confirm that contrast enhancement is appropriate for this illustration
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
testing/test_filename_format.py
DELETED
@@ -1,93 +0,0 @@
|
|
1 |
-
"""Test the new filename formatting"""
|
2 |
-
import os
|
3 |
-
import sys
|
4 |
-
import datetime
|
5 |
-
import inspect
|
6 |
-
|
7 |
-
# Add the project root to the path so we can import modules
|
8 |
-
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
9 |
-
|
10 |
-
# Import the main utils.py file directly
|
11 |
-
import utils as root_utils
|
12 |
-
|
13 |
-
print(f"Imported utils from: {root_utils.__file__}")
|
14 |
-
print("Current create_descriptive_filename implementation:")
|
15 |
-
print(inspect.getsource(root_utils.create_descriptive_filename))
|
16 |
-
|
17 |
-
def main():
|
18 |
-
"""Test the filename formatting"""
|
19 |
-
# Sample inputs
|
20 |
-
sample_files = [
|
21 |
-
"handwritten-letter.jpg",
|
22 |
-
"magician-or-bottle-cungerer.jpg",
|
23 |
-
"baldwin_15th_north.jpg",
|
24 |
-
"harpers.pdf",
|
25 |
-
"recipe.jpg"
|
26 |
-
]
|
27 |
-
|
28 |
-
# Sample OCR results for testing
|
29 |
-
sample_results = [
|
30 |
-
{
|
31 |
-
"detected_document_type": "handwritten",
|
32 |
-
"topics": ["Letter", "Handwritten", "19th Century", "Personal Correspondence"]
|
33 |
-
},
|
34 |
-
{
|
35 |
-
"topics": ["Newspaper", "Print", "19th Century", "Illustration", "Advertisement"]
|
36 |
-
},
|
37 |
-
{
|
38 |
-
"detected_document_type": "letter",
|
39 |
-
"topics": ["Correspondence", "Early Modern", "English Language"]
|
40 |
-
},
|
41 |
-
{
|
42 |
-
"detected_document_type": "magazine",
|
43 |
-
"topics": ["Publication", "Late 19th Century", "Magazine", "Historical"]
|
44 |
-
},
|
45 |
-
{
|
46 |
-
"detected_document_type": "recipe",
|
47 |
-
"topics": ["Food", "Culinary", "Historical", "Instruction"]
|
48 |
-
}
|
49 |
-
]
|
50 |
-
|
51 |
-
print("\nIMPROVED FILENAME FORMATTING TEST")
|
52 |
-
print("=" * 50)
|
53 |
-
|
54 |
-
# Format current date manually
|
55 |
-
current_date = datetime.datetime.now().strftime("%b %d, %Y")
|
56 |
-
print(f"Current date for filenames: {current_date}")
|
57 |
-
|
58 |
-
print("\nBEFORE vs AFTER Examples:\n")
|
59 |
-
|
60 |
-
for i, (original_file, result) in enumerate(zip(sample_files, sample_results)):
|
61 |
-
# Get file extension from original file
|
62 |
-
file_ext = os.path.splitext(original_file)[1]
|
63 |
-
|
64 |
-
# Generate the old style filename manually
|
65 |
-
original_name = os.path.splitext(original_file)[0]
|
66 |
-
|
67 |
-
doc_type_tag = ""
|
68 |
-
if 'detected_document_type' in result:
|
69 |
-
doc_type = result['detected_document_type'].lower()
|
70 |
-
doc_type_tag = f"_{doc_type.replace(' ', '_')}"
|
71 |
-
elif 'topics' in result and result['topics']:
|
72 |
-
doc_type_tag = f"_{result['topics'][0].lower().replace(' ', '_')}"
|
73 |
-
|
74 |
-
period_tag = ""
|
75 |
-
if 'topics' in result and result['topics']:
|
76 |
-
for tag in result['topics']:
|
77 |
-
if "century" in tag.lower() or "pre-" in tag.lower() or "era" in tag.lower():
|
78 |
-
period_tag = f"_{tag.lower().replace(' ', '_')}"
|
79 |
-
break
|
80 |
-
|
81 |
-
old_filename = f"{original_name}{doc_type_tag}{period_tag}{file_ext}"
|
82 |
-
|
83 |
-
# Generate the new descriptive filename with our improved formatter
|
84 |
-
new_filename = root_utils.create_descriptive_filename(original_file, result, file_ext)
|
85 |
-
|
86 |
-
print(f"Example {i+1}:")
|
87 |
-
print(f" Original: {original_file}")
|
88 |
-
print(f" Old Format: {old_filename}")
|
89 |
-
print(f" New Format: {new_filename}")
|
90 |
-
print()
|
91 |
-
|
92 |
-
if __name__ == "__main__":
|
93 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
testing/test_json_bleed.py
DELETED
@@ -1,46 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Test case to verify the fix for JSON bleed-through in historical text.
|
3 |
-
"""
|
4 |
-
import sys
|
5 |
-
import os
|
6 |
-
from pathlib import Path
|
7 |
-
|
8 |
-
# Add parent directory to path
|
9 |
-
sys.path.append(str(Path(__file__).parent.parent))
|
10 |
-
|
11 |
-
from utils.content_utils import format_structured_data
|
12 |
-
from utils.text_utils import clean_raw_text, format_markdown_text
|
13 |
-
|
14 |
-
# Sample text with JSON-like content (historical text with curly braces)
|
15 |
-
SAMPLE_TEXT = """# ENGLISH Credulity; or Ye're all Bottled.
|
16 |
-
|
17 |
-
O magnus pofldac Inimicis Rifus! Hor. Sat. WITH Grief, Refentment, and averted Eyes, Britannia droops to fee her Sons, (once Wile So fam'd for Arms, for Conduct fo renown'd With ev'ry Virtue ev'ry Glory crown'd) Now fink ignoble, and to nothing fall; Obedient marching forth at Folly's Call.
|
18 |
-
|
19 |
-
Text containing curly braces like these: { and } should not be parsed as JSON.
|
20 |
-
|
21 |
-
Even this text with a JSON-like pattern {"key": "value"} should be preserved as-is.
|
22 |
-
"""
|
23 |
-
|
24 |
-
def test_format_structured_data():
|
25 |
-
"""Test that format_structured_data preserves text content"""
|
26 |
-
result = format_structured_data(SAMPLE_TEXT)
|
27 |
-
|
28 |
-
# Verify the text is returned as-is without attempting to parse JSON-like structures
|
29 |
-
assert result == SAMPLE_TEXT
|
30 |
-
print("✓ format_structured_data correctly preserves text content")
|
31 |
-
|
32 |
-
# Make sure the output doesn't have any JSON code blocks
|
33 |
-
assert "```json" not in result
|
34 |
-
print("✓ format_structured_data does not create JSON code blocks")
|
35 |
-
|
36 |
-
return True
|
37 |
-
|
38 |
-
if __name__ == "__main__":
|
39 |
-
# Run the test
|
40 |
-
print("Running JSON bleed-through fix tests...\n")
|
41 |
-
success = test_format_structured_data()
|
42 |
-
|
43 |
-
if success:
|
44 |
-
print("\nAll tests passed! The JSON bleed-through issue is fixed.")
|
45 |
-
else:
|
46 |
-
print("\nSome tests failed.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|