# Source: historical-ocr / test_fix.py
# Committed by: milwright
# Commit 42dc069: "Consolidate segmentation improvements and code cleanup"
# File size: 1.65 kB
#!/usr/bin/env python3
import streamlit as st
from ocr_processing import process_file
# Mock a file upload
class MockFile:
    """Minimal in-memory stand-in for an uploaded file.

    Exposes a ``name`` attribute and a ``getvalue()`` method, which is the
    surface this script hands to ``process_file``.
    NOTE(review): presumably these are the only members ``process_file``
    relies on -- confirm against its implementation.
    """

    def __init__(self, name: str, content: bytes) -> None:
        """Store the file name and its raw content.

        Args:
            name: File name reported to the consumer (e.g. ``"scan.jpg"``).
            content: The file's content, returned as-is by ``getvalue()``.
        """
        self.name = name
        self._content = content

    def getvalue(self) -> bytes:
        """Return the content passed to the constructor, unchanged."""
        return self._content
def _preview(value, limit):
    """Return a printable preview of ``value`` capped at ``limit`` characters.

    Strings longer than ``limit`` are cut to ``limit`` characters and
    suffixed with "..."; shorter strings are returned unchanged. Non-string
    values are described by their type instead, because they cannot be
    sliced and concatenated like text.
    """
    if not isinstance(value, str):
        return f"Type: {type(value)}"
    if len(value) > limit:
        return value[:limit] + "..."
    return value


def main(image_path='input/magician-or-bottle-cungerer.jpg'):
    """Run ``process_file`` on one image and print its ``ocr_contents``.

    Args:
        image_path: Path of the image to process. Defaults to the
            problematic image from the original task.

    Raises:
        OSError: If ``image_path`` cannot be read.
        KeyError: If the result of ``process_file`` has no
            ``'ocr_contents'`` entry.
    """
    # Local import keeps this block self-contained; the file's top-level
    # imports are left untouched.
    from pathlib import Path

    # Load the test image and wrap it so it looks like an uploaded file.
    image = Path(image_path)
    uploaded_file = MockFile(image.name, image.read_bytes())

    # Process the file
    result = process_file(uploaded_file)
    ocr_contents = result['ocr_contents']

    # Display results
    print("\nDocument Content")
    print("Title")
    if 'title' in ocr_contents:
        print(ocr_contents['title'])
    print("\nMain")
    if 'main_text' in ocr_contents:
        print(ocr_contents['main_text'])
    print("\nRaw Text")
    if 'raw_text' in ocr_contents:
        # Same truncation rule as the per-key dump below: the ellipsis is
        # added only when text was actually cut.
        print(_preview(ocr_contents['raw_text'], 300))

    # Debug: Print all keys in ocr_contents
    print("\nAll OCR Content Keys:")
    for key in ocr_contents:
        print(f"- {key}")

    # Debug: Display content of all keys
    print("\nContent of each key:")
    for key, content in ocr_contents.items():
        print(f"\n--- {key} ---")
        print(_preview(content, 150))
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()