Spaces:

milwright
/

historical-ocr

Running

File size: 1,647 Bytes

42dc069

#!/usr/bin/env python3
import streamlit as st
from ocr_processing import process_file

# Mock a file upload
class MockFile:
    """Minimal in-memory stand-in for an uploaded file object.

    Exposes only a ``name`` attribute and a ``getvalue()`` method; both
    simply hand back what was supplied at construction time.
    """

    def __init__(self, name, content):
        # The payload is kept under a private attribute and is read back
        # through getvalue().
        self.name, self._content = name, content

    def getvalue(self):
        """Return the content object given to the constructor, unchanged."""
        return self._content

def _preview(content, limit):
    """Return a printable preview of one OCR content value.

    Strings longer than ``limit`` characters are cut to ``limit`` and
    suffixed with "..."; shorter strings are returned unchanged, so the
    ellipsis always means that text was actually dropped. Non-string
    values are summarised by their type instead of being sliced.
    """
    if not isinstance(content, str):
        return f"Type: {type(content)}"
    if len(content) > limit:
        return content[:limit] + "..."
    return content


def main():
    """Run ``process_file`` on the bundled test image and print the result.

    Reads ``input/magician-or-bottle-cungerer.jpg`` (relative to the
    current working directory), wraps the bytes in a ``MockFile`` and
    prints the ``title``, ``main_text`` and a 300-character preview of
    ``raw_text`` from ``result['ocr_contents']`` when those keys exist,
    followed by a debug dump of every key and a 150-character preview of
    each value.

    Raises whatever ``open`` raises if the image is missing. The result is
    indexed with ``'ocr_contents'`` directly, so a result without that key
    fails loudly rather than printing an empty report.
    """
    # Load the test image - using the problematic image from the original task
    with open('input/magician-or-bottle-cungerer.jpg', 'rb') as f:
        file_bytes = f.read()

    # Create mock file
    uploaded_file = MockFile('magician-or-bottle-cungerer.jpg', file_bytes)

    # Process the file
    result = process_file(uploaded_file)
    ocr_contents = result['ocr_contents']

    # Display results
    print("\nDocument Content")
    print("Title")
    if 'title' in ocr_contents:
        print(ocr_contents['title'])

    print("\nMain")
    if 'main_text' in ocr_contents:
        print(ocr_contents['main_text'])

    print("\nRaw Text")
    if 'raw_text' in ocr_contents:
        # Only mark the text as truncated when it really was, and do not
        # crash on a non-string value (same policy as the per-key dump).
        print(_preview(ocr_contents['raw_text'], 300))

    # Debug: Print all keys in ocr_contents
    print("\nAll OCR Content Keys:")
    for key in ocr_contents:
        print(f"- {key}")

    # Debug: Display content of all keys
    print("\nContent of each key:")
    for key, content in ocr_contents.items():
        print(f"\n--- {key} ---")
        print(_preview(content, 150))

# Run the smoke test only when this file is executed as a script, so that
# importing the module does not trigger file I/O or OCR processing.
if __name__ == "__main__":
    main()