Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| import os | |
| import streamlit as st | |
| from ocr_processing import process_file | |
# Mock a file upload
class MockFile:
    """Minimal in-memory stand-in for an uploaded file.

    Exposes only what this script relies on: a ``name`` attribute and a
    ``getvalue()`` method returning the stored content.
    """

    def __init__(self, name: str, content: bytes) -> None:
        """Store the file name and its raw content.

        Args:
            name: File name reported through the ``name`` attribute.
            content: Payload handed back unchanged by ``getvalue()``.
        """
        self.name = name
        self._content = content

    def getvalue(self) -> bytes:
        """Return the content supplied at construction time, unmodified."""
        return self._content
def _has_duplicated_text(raw_text, min_length=50, min_probe=20):
    """Heuristically report whether *raw_text* repeats its own opening.

    The first quarter of the text (whitespace-stripped) is used as a probe
    and searched for in the second half. Texts of ``min_length`` characters
    or fewer, probes of ``min_probe`` characters or fewer, and non-string
    values are never reported as duplicated.
    """
    if not isinstance(raw_text, str) or len(raw_text) <= min_length:
        return False
    half_point = len(raw_text) // 2
    probe = raw_text[:half_point // 2].strip()
    return len(probe) > min_probe and probe in raw_text[half_point:]


def test_image(image_path):
    """Run OCR processing on one image and print a summary of the result.

    Reads the file at ``image_path``, wraps its bytes in a ``MockFile`` named
    after the path's basename, passes that to ``process_file``, then prints
    the keys of ``result['ocr_contents']``, a preview of its ``raw_text``
    entry (when that entry is a string) and the outcome of a simple
    duplicated-text heuristic.

    Args:
        image_path: Path of the image file to process.

    Returns:
        Whatever ``process_file`` returned, unchanged.

    Raises:
        OSError: If ``image_path`` cannot be opened or read.
    """
    file_name = os.path.basename(image_path)
    print(f"\n\n===== Testing {file_name} =====")

    # Load the test image
    with open(image_path, 'rb') as f:
        file_bytes = f.read()

    # Wrap the bytes in a mock upload and process it
    uploaded_file = MockFile(file_name, file_bytes)
    result = process_file(uploaded_file)

    # Display results summary
    ocr_contents = result['ocr_contents']
    print("\nOCR Content Keys:")
    for key in ocr_contents:
        print(f"- {key}")

    # Look the text up once; a missing or non-string value is skipped rather
    # than crashing on len()/slicing.
    raw_text = ocr_contents.get('raw_text')

    # Show a preview of raw_text
    if isinstance(raw_text, str):
        preview = raw_text[:100] + "..." if len(raw_text) > 100 else raw_text
        print(f"\nRaw Text Preview: {preview}")

    # Check for duplicated content
    if _has_duplicated_text(raw_text):
        print("\n⚠️ WARNING: Possible text duplication detected!")
    else:
        print("\n✅ No text duplication detected")

    return result
def main():
    """Run ``test_image`` over the bundled sample images.

    Paths are relative to the current working directory. A missing file is
    reported and skipped so that the remaining images are still processed
    instead of the whole run stopping at the first ``FileNotFoundError``.
    """
    # Test with different image types
    test_files = [
        'input/magician-or-bottle-cungerer.jpg',  # The problematic file
        'input/recipe.jpg',                       # Simple text file
        'input/handwritten-letter.jpg',           # Mixed content
    ]

    for image_path in test_files:
        if not os.path.isfile(image_path):
            print(f"\n\n===== Skipping {os.path.basename(image_path)} "
                  f"(file not found: {image_path}) =====")
            continue
        test_image(image_path)
# Run the sample-image checks only when executed as a script, not on import.
if __name__ == "__main__":
    main()