File size: 4,511 Bytes
70d7f43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import torch
from PIL import Image
import numpy as np
import os
import sys

# pytesseract is an optional dependency. TESSERACT_AVAILABLE ends up True only
# when the package imports AND its tesseract version query succeeds.
try:
    import pytesseract
except ImportError:
    # Python wrapper is not installed at all.
    TESSERACT_AVAILABLE = False
else:
    try:
        pytesseract.get_tesseract_version()
    except Exception:
        # Wrapper imported, but the version query failed, so treat OCR via
        # tesseract as unusable.
        TESSERACT_AVAILABLE = False
    else:
        TESSERACT_AVAILABLE = True

# Module-level cache slots for the document AI processor/model pair; they stay
# None until get_document_ai_models() fills them in.
processor = None
model = None

def get_document_ai_models():
    """Return the shared LayoutLMv2 ``(processor, model)`` pair.

    Each half is stored in a module-level global and only loaded with
    ``from_pretrained`` while its global is still ``None``, so repeated calls
    hand back the same objects.

    Returns:
        Tuple ``(processor, model)``.
    """
    global processor, model

    checkpoint = "microsoft/layoutlmv2-base-uncased"

    # transformers is imported lazily, and only for the half that is missing.
    if processor is None:
        from transformers import LayoutLMv2Processor as processor_cls
        processor = processor_cls.from_pretrained(checkpoint)

    if model is None:
        from transformers import LayoutLMv2ForSequenceClassification as model_cls
        model = model_cls.from_pretrained(checkpoint)

    return processor, model

def extract_text_with_tesseract(image):
    """Extract words and their bounding boxes using Tesseract OCR.

    Args:
        image: A PIL image, or a numpy array that ``Image.fromarray`` accepts.
            Either way it is converted to RGB before OCR.

    Returns:
        Tuple ``(words, word_boxes)`` of parallel lists. ``words`` holds every
        non-blank text entry reported by ``pytesseract.image_to_data``;
        ``word_boxes`` holds the matching ``[left, top, left + width,
        top + height]`` box for each word.

    Raises:
        RuntimeError: If Tesseract was flagged as unavailable at import time.
    """
    if not TESSERACT_AVAILABLE:
        raise RuntimeError("tesseract is not installed or it's not in your PATH. See README file for more information.")

    if isinstance(image, np.ndarray):
        pil_image = Image.fromarray(image).convert("RGB")
    else:
        pil_image = image.convert("RGB")

    # One image_to_data pass provides both the text and the geometry. The
    # previous extra image_to_string call ran OCR a second time and its
    # result was never used.
    data = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DICT)

    words = []
    word_boxes = []
    columns = zip(data['text'], data['left'], data['top'], data['width'], data['height'])
    for text, left, top, width, height in columns:
        # Skip entries whose text is empty or whitespace only.
        if text.strip():
            words.append(text)
            word_boxes.append([left, top, left + width, top + height])

    return words, word_boxes

# Cache slots for the TrOCR processor/model so the checkpoint is loaded at most
# once per process instead of on every OCR call.
_trocr_processor = None
_trocr_model = None


def _get_trocr_models():
    """Return the cached TrOCR ``(processor, model)`` pair, loading on first use."""
    global _trocr_processor, _trocr_model
    checkpoint = "microsoft/trocr-base-printed"
    if _trocr_processor is None:
        from transformers import TrOCRProcessor
        _trocr_processor = TrOCRProcessor.from_pretrained(checkpoint)
    if _trocr_model is None:
        from transformers import VisionEncoderDecoderModel
        _trocr_model = VisionEncoderDecoderModel.from_pretrained(checkpoint)
    return _trocr_processor, _trocr_model


def extract_text_with_transformers(image):
    """Extract text using transformers models when Tesseract is not available.

    Args:
        image: A PIL image, or a numpy array that ``Image.fromarray`` accepts.
            Either way it is converted to RGB before OCR.

    Returns:
        Tuple ``(words, word_boxes)``. ``words`` is the generated text split on
        whitespace. This path produces no layout information, so every entry
        of ``word_boxes`` is the placeholder ``[0, 0, 0, 0]``.

        This function does not raise on failure. Any exception is reported
        in-band as ``words == ["Error extracting text with transformers OCR:",
        <message>]`` with two placeholder boxes.
    """
    try:
        # Validate/convert the input first so a bad image fails before the
        # model is loaded.
        if isinstance(image, np.ndarray):
            pil_image = Image.fromarray(image).convert("RGB")
        else:
            pil_image = image.convert("RGB")

        # Distinct local names avoid shadowing the module-level
        # processor/model used for the document AI models.
        trocr_processor, trocr_model = _get_trocr_models()

        pixel_values = trocr_processor(pil_image, return_tensors="pt").pixel_values
        generated_ids = trocr_model.generate(pixel_values)
        generated_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        words = generated_text.split()

        # No bounding boxes are available here; emit one placeholder per word.
        word_boxes = [[0, 0, 0, 0] for _ in words]

        return words, word_boxes

    except Exception as e:
        # Best-effort contract: report the failure as data instead of raising.
        return ["Error extracting text with transformers OCR:", str(e)], [[0, 0, 0, 0], [0, 0, 0, 0]]

def extract_text_and_layout(image):
    """
    Extract text and layout information using OCR.

    Tesseract is used when it was detected at import time; otherwise the
    transformers-based OCR is used. There is no automatic retry with the
    other backend. Failures are reported through the ``'success'`` flag
    rather than by raising.

    Args:
        image: PIL Image object, or a numpy array that ``Image.fromarray``
            accepts.

    Returns:
        Dictionary with keys ``'words'`` (list of strings), ``'boxes'`` (one
        box per word, as returned by the OCR backend) and ``'success'``
        (bool). ``'success'`` is False when OCR raised, when the transformers
        backend reported an error, or when no words were found. In the two
        error cases ``'words'`` carries the error message; when no words were
        found both lists are empty.
    """
    # extract_text_with_transformers never raises: it signals failure in-band
    # by returning this message as the first "word". Real OCR words come from
    # str.split() and so can never contain the spaces this marker has.
    transformers_error_marker = "Error extracting text with transformers OCR:"
    backend_reported_error = False

    try:
        # Conversion is inside the try so an unsupported array yields the
        # error dict below instead of an exception.
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image).convert("RGB")

        if TESSERACT_AVAILABLE:
            words, boxes = extract_text_with_tesseract(image)
        else:
            words, boxes = extract_text_with_transformers(image)
            backend_reported_error = bool(words) and words[0] == transformers_error_marker
    except Exception as e:
        # The selected backend (or the image conversion) raised.
        return {
            'words': [f"Error extracting text: {str(e)}"],
            'boxes': [[0, 0, 0, 0]],
            'success': False
        }

    # If no words were found, return empty result
    if not words:
        return {
            'words': [],
            'boxes': [],
            'success': False
        }

    return {
        'words': words,
        'boxes': boxes,
        # An in-band error message must not be reported as successful OCR.
        'success': not backend_reported_error
    }