Spaces:
Running
Running
#!/usr/bin/env python3 | |
import os | |
import streamlit as st | |
from ocr_processing import process_file | |
# Mock a file upload | |
class MockFile:
    """Minimal in-memory stand-in for an uploaded file.

    Carries a ``name`` attribute and a ``getvalue()`` method.
    NOTE(review): presumably this mirrors the part of Streamlit's
    uploaded-file interface that ``process_file`` relies on -- confirm
    against ``ocr_processing``.
    """

    def __init__(self, name, content):
        # The payload is kept private; readers go through getvalue().
        self.name, self._content = name, content

    def getvalue(self):
        """Return the content exactly as it was given to the constructor."""
        return self._content
def _has_duplicated_text(raw_text):
    """Return True when the opening of *raw_text* recurs in its second half."""
    # Texts of 50 characters or fewer are never flagged.
    if len(raw_text) <= 50:
        return False
    half_point = len(raw_text) // 2
    # Probe with the (stripped) first quarter of the text.
    opening = raw_text[:half_point // 2].strip()
    # A probe of 20 characters or fewer is too short to be meaningful.
    return len(opening) > 20 and opening in raw_text[half_point:]


def test_image(image_path):
    """Run OCR processing on one image file and print a summary.

    The file is read as bytes, wrapped in a ``MockFile`` and passed to
    ``process_file``. The keys of ``result['ocr_contents']`` are listed and,
    when a ``'raw_text'`` entry is present, a preview of at most 100
    characters is printed and the text is checked for apparent duplication.

    Args:
        image_path: Path of the image file to process.

    Returns:
        The object returned by ``process_file``, unchanged.

    Raises:
        OSError: If the image file cannot be opened or read.
        KeyError: If the result has no ``'ocr_contents'`` entry.
    """
    file_name = os.path.basename(image_path)
    print(f"\n\n===== Testing {file_name} =====")

    # Load the test image
    with open(image_path, 'rb') as f:
        file_bytes = f.read()

    # Wrap the bytes in a mock upload and process it
    uploaded_file = MockFile(file_name, file_bytes)
    result = process_file(uploaded_file)

    # Display results summary
    ocr_contents = result['ocr_contents']
    print("\nOCR Content Keys:")
    for key in ocr_contents:
        print(f"- {key}")

    found_duplicated = False
    if 'raw_text' in ocr_contents:
        raw_text = ocr_contents['raw_text']
        if not isinstance(raw_text, str):
            # len() and slicing below assume text; report instead of raising.
            print(f"\nRaw Text is not a string ({type(raw_text).__name__}); "
                  "skipping preview and duplication check")
            return result

        # Show a preview of raw_text
        preview = raw_text[:100] + "..." if len(raw_text) > 100 else raw_text
        print(f"\nRaw Text Preview: {preview}")

        # Check for duplicated content
        found_duplicated = _has_duplicated_text(raw_text)

    if found_duplicated:
        print("\n⚠️ WARNING: Possible text duplication detected!")
    else:
        print("\n✅ No text duplication detected")
    return result
def main(test_files=None):
    """Run ``test_image`` on each sample image, skipping missing files.

    Args:
        test_files: Optional iterable of image paths to test. When omitted
            (``None``), the built-in sample images under ``input/`` are used.
    """
    if test_files is None:
        # Test with different image types
        test_files = [
            'input/magician-or-bottle-cungerer.jpg',  # The problematic file
            'input/recipe.jpg',  # Simple text file
            'input/handwritten-letter.jpg',  # Mixed content
        ]

    for image_path in test_files:
        # One missing sample should not stop the remaining images from
        # being tested, so report it and move on.
        if not os.path.isfile(image_path):
            print(f"\n⚠️ Skipping {image_path}: file not found")
            continue
        test_image(image_path)


# Run the checks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()