Spaces:

milwright
/

historical-ocr

Running

historical-ocr / test_fix.py

Consolidate segmentation improvements and code cleanup

42dc069 3 months ago

1.65 kB

	#!/usr/bin/env python3
	import streamlit as st
	from ocr_processing import process_file

	# Mock a file upload
	class MockFile:
	    """Minimal stand-in for an uploaded file object.

	    Provides only a ``name`` attribute and a ``getvalue()`` method that
	    returns the content supplied at construction time.
	    """

	    def __init__(self, name: str, content: bytes) -> None:
	        """Remember the file name and the content to hand back later."""
	        self.name, self._content = name, content

	    def getvalue(self) -> bytes:
	        """Return the stored content exactly as it was passed in."""
	        return self._content

	def _preview(text, limit):
	    """Return ``text`` cut to ``limit`` characters, adding "..." only when it was cut."""
	    return text[:limit] + "..." if len(text) > limit else text


	def main(image_path='input/magician-or-bottle-cungerer.jpg'):
	    """Run ``process_file`` on a test image and print the OCR contents.

	    The image is read from disk, wrapped in a ``MockFile`` and passed to
	    ``process_file``. The ``'ocr_contents'`` entry of the result is then
	    printed: title, main text, a raw-text preview, the list of keys, and a
	    short preview of every value.

	    Args:
	        image_path: Path of the image to process. Defaults to the image this
	            script was originally written to check.

	    Raises:
	        OSError: If the image cannot be opened or read.
	        KeyError: If the result has no ``'ocr_contents'`` entry.
	    """
	    import os  # local import: only needed to derive the mock file's name

	    # Load the test image
	    with open(image_path, 'rb') as f:
	        file_bytes = f.read()

	    # Create mock file named after the image, without its directory part
	    uploaded_file = MockFile(os.path.basename(image_path), file_bytes)

	    # Process the file and look the contents up once
	    ocr_contents = process_file(uploaded_file)['ocr_contents']

	    # Display results
	    print("\nDocument Content")
	    print("Title")
	    if 'title' in ocr_contents:
	        print(ocr_contents['title'])

	    print("\nMain")
	    if 'main_text' in ocr_contents:
	        print(ocr_contents['main_text'])

	    print("\nRaw Text")
	    if 'raw_text' in ocr_contents:
	        raw_text = ocr_contents['raw_text']
	        # Only strings are truncated; anything else is printed as-is rather
	        # than failing on the "..." concatenation.
	        print(_preview(raw_text, 300) if isinstance(raw_text, str) else raw_text)

	    # Debug: Print all keys in ocr_contents
	    print("\nAll OCR Content Keys:")
	    for key in ocr_contents:
	        print(f"- {key}")

	    # Debug: Display content of all keys
	    print("\nContent of each key:")
	    for key, content in ocr_contents.items():
	        print(f"\n--- {key} ---")
	        if isinstance(content, str):
	            print(_preview(content, 150))
	        else:
	            print(f"Type: {type(content)}")

	# Run the check only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()