Spaces:

milwright
/

historical-ocr

Running

File size: 2,412 Bytes

59aaeae

#!/usr/bin/env python3
"""
PDFOCR - Module for processing PDF files with OCR and extracting structured data.
"""

import json
from pathlib import Path
from structured_ocr import StructuredOCR

class PDFOCR:
    """Process PDF files with OCR and extract structured data.

    Thin wrapper around ``StructuredOCR``: it validates the input path,
    delegates the actual work to ``StructuredOCR.process_file`` with
    ``file_type="pdf"``, and can optionally persist the result as JSON.
    """

    def __init__(self, api_key=None):
        """Create the underlying ``StructuredOCR`` processor.

        Args:
            api_key: Forwarded unchanged to ``StructuredOCR``; how a ``None``
                value is handled is decided there.
        """
        self.processor = StructuredOCR(api_key=api_key)

    def process_pdf(self, pdf_path, use_vision: bool = True):
        """Process a PDF file with OCR and extract structured data.

        Args:
            pdf_path: Path to the PDF file (string or ``Path``).
            use_vision: Forwarded to ``StructuredOCR.process_file``; per the
                original documentation it selects the vision model for
                improved analysis.

        Returns:
            Whatever ``StructuredOCR.process_file`` returns -- expected to be
            a dictionary with the structured OCR results.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not point to an existing
                regular file (a missing path or a directory).
        """
        pdf_path = Path(pdf_path)
        # is_file() rather than exists(): a directory "exists" but cannot be
        # processed as a PDF, so reject it here with a clear error.
        if not pdf_path.is_file():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        return self.processor.process_file(
            pdf_path, file_type="pdf", use_vision=use_vision
        )

    def save_json_output(self, pdf_path, output_path, use_vision: bool = True) -> Path:
        """Process a PDF file and save the structured output as JSON.

        Args:
            pdf_path: Path to the PDF file.
            output_path: Path where the JSON output is written; missing
                parent directories are created.
            use_vision: Forwarded to ``process_pdf``.

        Returns:
            Path to the saved JSON file.

        Raises:
            FileNotFoundError: If ``pdf_path`` is not an existing file.
            TypeError: If the OCR result is not JSON serializable. In that
                case nothing is written to ``output_path``.
        """
        result = self.process_pdf(pdf_path, use_vision=use_vision)

        # Serialize before touching the filesystem so that a failure here
        # cannot leave a truncated, invalid JSON file at the destination.
        serialized = json.dumps(result, indent=2)

        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding keeps the file independent of the platform locale.
        output_path.write_text(serialized, encoding="utf-8")

        return output_path

# Command-line entry point for quick manual testing:
#   python pdf_ocr.py <pdf_path> [output_path]
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python pdf_ocr.py <pdf_path> [output_path]")
        sys.exit(1)

    source_pdf = cli_args[0]
    # The second positional argument is optional; without it the result is
    # printed instead of being saved.
    destination = cli_args[1] if len(cli_args) > 1 else None

    ocr = PDFOCR()

    if not destination:
        # No output file requested: dump the structured result to stdout.
        print(json.dumps(ocr.process_pdf(source_pdf), indent=2))
    else:
        saved_to = ocr.save_json_output(source_pdf, destination)
        print(f"Results saved to: {saved_to}")