historical-ocr / testing /test_filename_format.py
milwright's picture
Consolidate segmentation improvements and code cleanup
42dc069
"""Test the new filename formatting"""
import os
import sys
import datetime
import inspect
# Put the parent of this file's directory at the front of sys.path so the
# project's top-level modules resolve before anything else with the same name.
_project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir)
)
sys.path.insert(0, _project_root)

# Load the top-level utils.py under an explicit alias.
import utils as root_utils

# Show which module was actually picked up, then dump the current source of
# the function under test so the run output records what was exercised.
print(f"Imported utils from: {root_utils.__file__}")
print("Current create_descriptive_filename implementation:")
print(inspect.getsource(root_utils.create_descriptive_filename))
def main():
    """Print old-style and new-style filenames side by side for sample inputs.

    Each sample file name is paired by position with a mock OCR result. The
    legacy name is rebuilt inline as base name + document-type tag + period
    tag + extension, and printed next to the value returned by
    ``root_utils.create_descriptive_filename``. Nothing is asserted or
    returned; the output is meant to be inspected by eye.
    """
    # File names to format, matched by index with ``sample_results``.
    sample_files = [
        "handwritten-letter.jpg",
        "magician-or-bottle-cungerer.jpg",
        "baldwin_15th_north.jpg",
        "harpers.pdf",
        "recipe.jpg"
    ]

    # Mock OCR results; the second entry deliberately has no
    # 'detected_document_type', which exercises the topics fallback.
    sample_results = [
        {
            "detected_document_type": "handwritten",
            "topics": ["Letter", "Handwritten", "19th Century", "Personal Correspondence"]
        },
        {
            "topics": ["Newspaper", "Print", "19th Century", "Illustration", "Advertisement"]
        },
        {
            "detected_document_type": "letter",
            "topics": ["Correspondence", "Early Modern", "English Language"]
        },
        {
            "detected_document_type": "magazine",
            "topics": ["Publication", "Late 19th Century", "Magazine", "Historical"]
        },
        {
            "detected_document_type": "recipe",
            "topics": ["Food", "Culinary", "Historical", "Instruction"]
        }
    ]

    def as_tag(text):
        # Legacy tag form: underscore prefix, lower-cased, spaces -> underscores.
        return f"_{text.lower().replace(' ', '_')}"

    # Substrings that mark a topic as a time period in the legacy scheme.
    period_markers = ("century", "pre-", "era")

    print("\nIMPROVED FILENAME FORMATTING TEST")
    print("=" * 50)

    # Shown for reference only; it is not passed to the formatter.
    current_date = datetime.datetime.now().strftime("%b %d, %Y")
    print(f"Current date for filenames: {current_date}")
    print("\nBEFORE vs AFTER Examples:\n")

    numbered_samples = enumerate(zip(sample_files, sample_results), start=1)
    for number, (source_name, ocr_result) in numbered_samples:
        stem, extension = os.path.splitext(source_name)
        topics = ocr_result.get('topics')

        # Document-type tag: explicit type wins, otherwise the first topic.
        type_suffix = ""
        if 'detected_document_type' in ocr_result:
            type_suffix = as_tag(ocr_result['detected_document_type'])
        elif topics:
            type_suffix = as_tag(topics[0])

        # Period tag: the first topic containing any period marker, if any.
        period_suffix = next(
            (
                as_tag(topic)
                for topic in (topics or ())
                if any(marker in topic.lower() for marker in period_markers)
            ),
            "",
        )

        legacy_name = f"{stem}{type_suffix}{period_suffix}{extension}"

        # Name produced by the formatter under test.
        descriptive_name = root_utils.create_descriptive_filename(
            source_name, ocr_result, extension
        )

        print(f"Example {number}:")
        print(f" Original: {source_name}")
        print(f" Old Format: {legacy_name}")
        print(f" New Format: {descriptive_name}")
        print()
# Run the comparison only when this file is executed as a script.
if __name__ == "__main__":
    main()