milwright committed on
Commit
cfec41e
·
verified ·
1 Parent(s): 32b817f

Delete testing

Browse files
testing/magician_test/branch_comparison.txt DELETED
@@ -1,20 +0,0 @@
1
- Comparison of ocr_utils.py between main and reconcile-improvements branches
2
- ==================================================================
3
-
4
- Key improvements in reconcile-improvements branch:
5
-
6
- 1. Enhanced illustration/etching detection:
7
- - Added detection based on filename keywords (e.g., 'magician', 'illustration')
8
- - Implemented image-based detection using edge density analysis
9
-
10
- 2. Specialized processing for illustrations:
11
- - Gentler scaling to preserve fine details
12
- - Mild contrast enhancement (1.3 vs. higher values for other documents)
13
- - Specialized sharpening for fine lines in etchings
14
- - Higher quality settings (95 vs. 85) to prevent detail loss
15
-
16
- 3. Performance optimizations:
17
- - More efficient processing paths for different image types
18
- - Better memory management for large images
19
-
20
- Test results for magician-or-bottle-cungerer.jpg demonstrate these improvements.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
testing/magician_test/processed_magician.jpg DELETED

Git LFS Details

  • SHA256: 8824abe6e81e6b7847eca83e39fda77c3b6937d292f3647078ba4af2531d65ff
  • Pointer size: 132 Bytes
  • Size of remote file: 2.33 MB
testing/magician_test/test_report.txt DELETED
@@ -1,16 +0,0 @@
1
- Test Report: Magician Image Processing
2
- =====================================
3
-
4
- Original image: input/magician-or-bottle-cungerer.jpg
5
- Original size: 2500x2116
6
- Processed size: 2500x2116
7
- Processing time: 0.58 seconds
8
- Size reduction: 0.00%
9
-
10
- Illustration Detection:
11
- - Filename contains 'magician': True
12
-
13
- Visual Inspection Notes:
14
- - Check processed_magician.jpg for preservation of fine details
15
- - Verify that etching lines are clear and not over-processed
16
- - Confirm that contrast enhancement is appropriate for this illustration
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
testing/newspaper_test/newspaper_comparison.jpg DELETED

Git LFS Details

  • SHA256: 1a48abfd88f516f704f574b8d3d372c07d2c71a82e5743eae205aece7d77c2de
  • Pointer size: 132 Bytes
  • Size of remote file: 3.58 MB
testing/newspaper_test/newspaper_test_report.txt DELETED
@@ -1,18 +0,0 @@
1
- Newspaper Detection Test Report
2
- ==============================
3
-
4
- Original image: input/magician-or-bottle-cungerer.jpg
5
- Original size: 2500x2116
6
- Processed size: 2000x1692
7
- Processing time: 0.71 seconds
8
-
9
- Aspect ratio: 1.18
10
- Meets newspaper criteria by dimensions: False
11
-
12
- Size reduction: 36.03%
13
-
14
- Notes on Newspaper Processing:
15
- - Newspaper format should be detected based on dimensions and aspect ratio
16
- - Specialized processing should be applied for newspaper text extraction
17
- - Check if the processed image shows enhanced text clarity in columns
18
- - Verify that the column structure is preserved for better OCR results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
testing/newspaper_test/processed_newspaper.jpg DELETED

Git LFS Details

  • SHA256: c1a856a643e381b7312ca16931ca33a3b670dbf456357f8a7c5e91fd92ce7b5f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.6 MB
testing/output/processed_magician.jpg DELETED

Git LFS Details

  • SHA256: 8824abe6e81e6b7847eca83e39fda77c3b6937d292f3647078ba4af2531d65ff
  • Pointer size: 132 Bytes
  • Size of remote file: 2.33 MB
testing/output/test_report.txt DELETED
@@ -1,16 +0,0 @@
1
- Test Report: Magician Image Processing
2
- =====================================
3
-
4
- Original image: input/magician-or-bottle-cungerer.jpg
5
- Original size: 2500x2116
6
- Processed size: 2500x2116
7
- Processing time: 0.58 seconds
8
- Size reduction: 0.00%
9
-
10
- Illustration Detection:
11
- - Filename contains 'magician': True
12
-
13
- Visual Inspection Notes:
14
- - Check processed_magician.jpg for preservation of fine details
15
- - Verify that etching lines are clear and not over-processed
16
- - Confirm that contrast enhancement is appropriate for this illustration
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
testing/test_filename_format.py DELETED
@@ -1,93 +0,0 @@
1
- """Test the new filename formatting"""
2
- import os
3
- import sys
4
- import datetime
5
- import inspect
6
-
7
- # Add the project root to the path so we can import modules
8
- sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
9
-
10
- # Import the main utils.py file directly
11
- import utils as root_utils
12
-
13
- print(f"Imported utils from: {root_utils.__file__}")
14
- print("Current create_descriptive_filename implementation:")
15
- print(inspect.getsource(root_utils.create_descriptive_filename))
16
-
17
- def main():
18
- """Test the filename formatting"""
19
- # Sample inputs
20
- sample_files = [
21
- "handwritten-letter.jpg",
22
- "magician-or-bottle-cungerer.jpg",
23
- "baldwin_15th_north.jpg",
24
- "harpers.pdf",
25
- "recipe.jpg"
26
- ]
27
-
28
- # Sample OCR results for testing
29
- sample_results = [
30
- {
31
- "detected_document_type": "handwritten",
32
- "topics": ["Letter", "Handwritten", "19th Century", "Personal Correspondence"]
33
- },
34
- {
35
- "topics": ["Newspaper", "Print", "19th Century", "Illustration", "Advertisement"]
36
- },
37
- {
38
- "detected_document_type": "letter",
39
- "topics": ["Correspondence", "Early Modern", "English Language"]
40
- },
41
- {
42
- "detected_document_type": "magazine",
43
- "topics": ["Publication", "Late 19th Century", "Magazine", "Historical"]
44
- },
45
- {
46
- "detected_document_type": "recipe",
47
- "topics": ["Food", "Culinary", "Historical", "Instruction"]
48
- }
49
- ]
50
-
51
- print("\nIMPROVED FILENAME FORMATTING TEST")
52
- print("=" * 50)
53
-
54
- # Format current date manually
55
- current_date = datetime.datetime.now().strftime("%b %d, %Y")
56
- print(f"Current date for filenames: {current_date}")
57
-
58
- print("\nBEFORE vs AFTER Examples:\n")
59
-
60
- for i, (original_file, result) in enumerate(zip(sample_files, sample_results)):
61
- # Get file extension from original file
62
- file_ext = os.path.splitext(original_file)[1]
63
-
64
- # Generate the old style filename manually
65
- original_name = os.path.splitext(original_file)[0]
66
-
67
- doc_type_tag = ""
68
- if 'detected_document_type' in result:
69
- doc_type = result['detected_document_type'].lower()
70
- doc_type_tag = f"_{doc_type.replace(' ', '_')}"
71
- elif 'topics' in result and result['topics']:
72
- doc_type_tag = f"_{result['topics'][0].lower().replace(' ', '_')}"
73
-
74
- period_tag = ""
75
- if 'topics' in result and result['topics']:
76
- for tag in result['topics']:
77
- if "century" in tag.lower() or "pre-" in tag.lower() or "era" in tag.lower():
78
- period_tag = f"_{tag.lower().replace(' ', '_')}"
79
- break
80
-
81
- old_filename = f"{original_name}{doc_type_tag}{period_tag}{file_ext}"
82
-
83
- # Generate the new descriptive filename with our improved formatter
84
- new_filename = root_utils.create_descriptive_filename(original_file, result, file_ext)
85
-
86
- print(f"Example {i+1}:")
87
- print(f" Original: {original_file}")
88
- print(f" Old Format: {old_filename}")
89
- print(f" New Format: {new_filename}")
90
- print()
91
-
92
- if __name__ == "__main__":
93
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
testing/test_json_bleed.py DELETED
@@ -1,46 +0,0 @@
1
- """
2
- Test case to verify the fix for JSON bleed-through in historical text.
3
- """
4
- import sys
5
- import os
6
- from pathlib import Path
7
-
8
- # Add parent directory to path
9
- sys.path.append(str(Path(__file__).parent.parent))
10
-
11
- from utils.content_utils import format_structured_data
12
- from utils.text_utils import clean_raw_text, format_markdown_text
13
-
14
- # Sample text with JSON-like content (historical text with curly braces)
15
- SAMPLE_TEXT = """# ENGLISH Credulity; or Ye're all Bottled.
16
-
17
- O magnus pofldac Inimicis Rifus! Hor. Sat. WITH Grief, Refentment, and averted Eyes, Britannia droops to fee her Sons, (once Wile So fam'd for Arms, for Conduct fo renown'd With ev'ry Virtue ev'ry Glory crown'd) Now fink ignoble, and to nothing fall; Obedient marching forth at Folly's Call.
18
-
19
- Text containing curly braces like these: { and } should not be parsed as JSON.
20
-
21
- Even this text with a JSON-like pattern {"key": "value"} should be preserved as-is.
22
- """
23
-
24
- def test_format_structured_data():
25
- """Test that format_structured_data preserves text content"""
26
- result = format_structured_data(SAMPLE_TEXT)
27
-
28
- # Verify the text is returned as-is without attempting to parse JSON-like structures
29
- assert result == SAMPLE_TEXT
30
- print("✓ format_structured_data correctly preserves text content")
31
-
32
- # Make sure the output doesn't have any JSON code blocks
33
- assert "```json" not in result
34
- print("✓ format_structured_data does not create JSON code blocks")
35
-
36
- return True
37
-
38
- if __name__ == "__main__":
39
- # Run the test
40
- print("Running JSON bleed-through fix tests...\n")
41
- success = test_format_structured_data()
42
-
43
- if success:
44
- print("\nAll tests passed! The JSON bleed-through issue is fixed.")
45
- else:
46
- print("\nSome tests failed.")