milwright committed on
Commit
ebe8abd
·
verified ·
1 Parent(s): 5aadd21

Delete verify_fix.py

Browse files
Files changed (1) hide show
  1. verify_fix.py +0 -70
verify_fix.py DELETED
@@ -1,70 +0,0 @@
1
- #!/usr/bin/env python3
2
- import os
3
- import streamlit as st
4
- from ocr_processing import process_file
5
-
6
- # Mock a file upload
7
class MockFile:
    """Minimal in-memory stand-in for an uploaded file.

    Carries just a ``name`` attribute and a ``getvalue()`` accessor for the
    content it was built with, which is all this script hands to
    ``process_file``.
    """

    def __init__(self, name, content):
        # The payload stays private; it is read back through getvalue().
        self._content, self.name = content, name

    def getvalue(self):
        """Return the content given to the constructor, unchanged."""
        return self._content
14
-
15
def _has_duplicated_text(raw_text):
    """Return True when the opening of *raw_text* reappears in its second half.

    Heuristic: take the first quarter of the text (stripped) and look for it
    verbatim in the second half. Texts of 50 characters or fewer, or whose
    stripped first quarter is 20 characters or fewer, are never flagged.
    """
    if len(raw_text) <= 50:
        return False
    half_point = len(raw_text) // 2
    first_quarter = raw_text[:half_point // 2].strip()
    if len(first_quarter) <= 20:
        return False
    return first_quarter in raw_text[half_point:]


def test_image(image_path):
    """Run OCR processing on one image and print a summary of the result.

    Reads the image bytes from *image_path*, wraps them in a ``MockFile``,
    passes that to ``process_file`` and then prints the keys of
    ``result['ocr_contents']``, a preview of ``raw_text`` (first 100
    characters) and the outcome of the duplicated-text heuristic.

    A ``raw_text`` value that is missing or not a string is treated as
    absent: no preview is printed and no duplication is reported.

    Args:
        image_path: Path of the image file to process.

    Returns:
        The value returned by ``process_file``.
    """
    file_name = os.path.basename(image_path)
    print(f"\n\n===== Testing {file_name} =====")

    # Load the test image
    with open(image_path, 'rb') as f:
        file_bytes = f.read()

    # Process the file through a mock upload object
    result = process_file(MockFile(file_name, file_bytes))
    ocr_contents = result['ocr_contents']

    # Display results summary
    print("\nOCR Content Keys:")
    for key in ocr_contents:
        print(f"- {key}")

    # Look raw_text up once; both the preview and the check below use it.
    raw_text = ocr_contents.get('raw_text')
    found_duplicated = False
    if isinstance(raw_text, str):
        # Show a preview of raw_text
        preview = raw_text[:100] + "..." if len(raw_text) > 100 else raw_text
        print(f"\nRaw Text Preview: {preview}")

        # Check if the same text appears twice in sequence (a sign of duplication)
        found_duplicated = _has_duplicated_text(raw_text)
        if found_duplicated:
            print("\n⚠️ WARNING: Possible text duplication detected!")

    if not found_duplicated:
        print("\n✅ No text duplication detected")

    return result
57
-
58
def main(test_files=None):
    """Run ``test_image`` on each image path in *test_files*.

    Args:
        test_files: Iterable of image paths to test. When ``None`` (the
            default), the script's built-in list of sample images is used.

    Paths that do not point to an existing file are reported and skipped, so
    one missing sample does not abort the remaining checks.
    """
    if test_files is None:
        # Test with different image types
        test_files = [
            'input/magician-or-bottle-cungerer.jpg',  # The problematic file
            'input/recipe.jpg',  # Simple text file
            'input/handwritten-letter.jpg',  # Mixed content
        ]

    for image_path in test_files:
        if not os.path.isfile(image_path):
            print(f"\nSkipping {image_path}: file not found")
            continue
        test_image(image_path)


if __name__ == "__main__":
    main()