import torch
from transformers import pipeline, TrOCRProcessor, VisionEncoderDecoderModel
from transformers import AutoModel, AutoProcessor
from PIL import Image
import requests

# Method 1: Using TrOCR with pipeline (easiest approach)
print("Loading TrOCR model using pipeline...")
trocr_pipe = pipeline("image-to-text", model="microsoft/trocr-base-handwritten")

# Method 2: Loading TrOCR model directly (more control)
print("Loading TrOCR model directly...")
trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Method 3: Loading LayoutLMv3 model (for document layout understanding)
print("Loading LayoutLMv3 model...")
layoutlm_processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base")
layoutlm_model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")


def extract_text_with_trocr_pipeline(image_path):
    """
    Extract text from a handwritten image using the TrOCR pipeline.
    """
    try:
        # Convert to RGB so grayscale or RGBA scans don't trip up the image processor
        image = Image.open(image_path).convert("RGB")
        result = trocr_pipe(image)
        return result[0]['generated_text']
    except Exception as e:
        print(f"Error processing image with pipeline: {e}")
        return None


def extract_text_with_trocr_direct(image_path):
    """
    Extract text from a handwritten image using the TrOCR model directly.
    """
    try:
        image = Image.open(image_path).convert("RGB")
        pixel_values = trocr_processor(image, return_tensors="pt").pixel_values
        generated_ids = trocr_model.generate(pixel_values)
        generated_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return generated_text
    except Exception as e:
        print(f"Error processing image with direct model: {e}")
        return None
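
# The direct TrOCR path above also works on batches: TrOCRProcessor accepts a
# list of PIL images, and batch_decode returns one string per image. The helper
# below is a hedged sketch of that idea -- the function name and the default
# batch_size of 8 are illustrative choices, not part of the original script.
def extract_text_with_trocr_batched(image_paths, batch_size=8):
    """
    Run the direct TrOCR model over several images at once (sketch).
    """
    texts = []
    for start in range(0, len(image_paths), batch_size):
        batch_paths = image_paths[start:start + batch_size]
        images = [Image.open(p).convert("RGB") for p in batch_paths]
        # The processor stacks the images into a single (N, 3, H, W) tensor
        pixel_values = trocr_processor(images, return_tensors="pt").pixel_values
        generated_ids = trocr_model.generate(pixel_values)
        # One decoded string per image in the batch
        texts.extend(trocr_processor.batch_decode(generated_ids, skip_special_tokens=True))
    return texts
# e.g. texts = extract_text_with_trocr_batched(["page1.jpg", "page2.jpg"])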

def analyze_document_layout(image_path):
    """
    Analyze document layout using LayoutLMv3.
    Note: This is a basic example. LayoutLMv3 typically requires fine-tuning
    for specific tasks.
    """
    try:
        image = Image.open(image_path).convert("RGB")
        # LayoutLMv3 needs words and bounding boxes alongside the image. With the
        # default processor (apply_ocr=True) these come from a built-in Tesseract
        # OCR pass, which requires pytesseract and the Tesseract binary. A variant
        # that supplies its own OCR results is sketched at the bottom of this file.
        encoding = layoutlm_processor(image, return_tensors="pt")
        with torch.no_grad():
            outputs = layoutlm_model(**encoding)
        # The outputs contain embeddings that can be used for downstream tasks
        return outputs.last_hidden_state
    except Exception as e:
        print(f"Error analyzing document layout: {e}")
        return None


def process_bill_image(image_path):
    """
    Complete pipeline to process a Singtel bill image.
    """
    print(f"Processing bill image: {image_path}")

    # Extract text using TrOCR
    print("Extracting text with TrOCR...")
    extracted_text = extract_text_with_trocr_pipeline(image_path)

    if extracted_text:
        print(f"Extracted text: {extracted_text}")
        # You can add bill-specific parsing logic here, for example looking for:
        # - Account numbers
        # - Amounts due
        # - Due dates
        # - Service charges
        # (a hedged regex sketch of such parsing, parse_bill_fields, is at the
        # bottom of this file)
        return {
            'extracted_text': extracted_text,
            'status': 'success'
        }
    else:
        return {
            'extracted_text': None,
            'status': 'failed'
        }


# Example usage
if __name__ == "__main__":
    print("=== Singtel Bill Scanner Test ===")

    # Replace with your actual image path
    # image_path = "path/to/your/bill_image.jpg"

    # For testing with a sample image (you can download this)
    sample_url = "https://huggingface.co/microsoft/trocr-base-handwritten/resolve/main/images/example_1.jpg"

    print("\nTesting with sample image...")
    try:
        # Download sample image for testing
        response = requests.get(sample_url, timeout=30)
        response.raise_for_status()  # fail fast instead of writing an error page to disk
        with open("sample_handwritten.jpg", "wb") as f:
            f.write(response.content)

        # Test the models
        result = process_bill_image("sample_handwritten.jpg")
        print(f"Result: {result}")
    except Exception as e:
        print(f"Error downloading sample image: {e}")
        print("Please provide your own image path to test the models")

    print("\n=== Usage Instructions ===")
    print("1. To use with your own image:")
    print("   result = process_bill_image('path/to/your/bill.jpg')")
    print("\n2. For pipeline approach:")
    print("   text = extract_text_with_trocr_pipeline('image.jpg')")
    print("\n3. For direct model approach:")
    print("   text = extract_text_with_trocr_direct('image.jpg')")
    print("\n4. For document layout analysis:")
    print("   layout = analyze_document_layout('image.jpg')")
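
# analyze_document_layout above relies on the processor's built-in OCR. When you
# already have words and boxes (e.g. from another OCR engine), the LayoutLMv3
# processor can be created with apply_ocr=False and fed those results directly.
# A minimal sketch of that variant -- the function name is illustrative, and the
# boxes are assumed to be in LayoutLMv3's normalized 0-1000 (x0, y0, x1, y1)
# format.
def analyze_document_layout_with_ocr(image_path, words, boxes):
    """
    Layout analysis with caller-supplied OCR words and boxes (sketch).
    """
    try:
        processor = AutoProcessor.from_pretrained(
            "microsoft/layoutlmv3-base", apply_ocr=False
        )
        image = Image.open(image_path).convert("RGB")
        # Words and boxes are passed explicitly instead of being OCR'd internally
        encoding = processor(image, words, boxes=boxes, return_tensors="pt")
        with torch.no_grad():
            outputs = layoutlm_model(**encoding)
        return outputs.last_hidden_state
    except Exception as e:
        print(f"Error analyzing document layout with supplied OCR: {e}")
        return None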
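
# process_bill_image above leaves bill-specific parsing as a TODO. The sketch
# below shows one way that parsing could look: plain regular expressions over
# the extracted text. The field names and patterns are hypothetical -- this
# script does not specify what a Singtel bill actually looks like, so treat
# them as placeholders to adapt against real bills.
import re


def parse_bill_fields(extracted_text):
    """
    Pull rough bill fields out of OCR text with regexes (sketch).
    """
    patterns = {
        # Placeholder patterns; adjust once real bill text is available
        'account_number': r'account\s*(?:no\.?|number)\s*[:#]?\s*([A-Z0-9-]+)',
        'amount_due': r'(?:total\s+)?amount\s+due\s*[:$]?\s*\$?\s*([\d,]+\.\d{2})',
        'due_date': r'due\s+date\s*[:]?\s*(\d{1,2}\s+\w{3,9}\s+\d{4})',
    }
    fields = {}
    for name, pattern in patterns.items():
        match = re.search(pattern, extracted_text, flags=re.IGNORECASE)
        fields[name] = match.group(1) if match else None
    return fields
# e.g. fields = parse_bill_fields(result['extracted_text'])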