Spaces:
Running
Running
File size: 4,511 Bytes
70d7f43 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import torch
from PIL import Image
import numpy as np
import os
import sys
# Decide once, at import time, whether Tesseract OCR can be used.
# TESSERACT_AVAILABLE ends up True only when the pytesseract package imports
# AND asking it for the tesseract version succeeds.
try:
    import pytesseract
except ImportError:
    # The Python wrapper itself is missing.
    TESSERACT_AVAILABLE = False
else:
    try:
        pytesseract.get_tesseract_version()
    except Exception:
        # The wrapper imported but the version probe failed, so treat
        # Tesseract as unusable.
        TESSERACT_AVAILABLE = False
    else:
        TESSERACT_AVAILABLE = True
# Module-level cache for the LayoutLMv2 processor and model; both start out
# unset and are filled in on demand by get_document_ai_models().
processor = None
model = None


def get_document_ai_models():
    """Return the cached LayoutLMv2 processor and model, loading them if needed.

    Each object is created at most once: the first call that finds the
    module-level ``processor`` or ``model`` still ``None`` loads it from the
    "microsoft/layoutlmv2-base-uncased" checkpoint and stores it globally.
    The ``transformers`` imports are deferred until a load is required.

    Returns:
        Tuple ``(processor, model)``.
    """
    global processor, model

    checkpoint = "microsoft/layoutlmv2-base-uncased"

    if processor is None:
        from transformers import LayoutLMv2Processor
        processor = LayoutLMv2Processor.from_pretrained(checkpoint)

    if model is None:
        from transformers import LayoutLMv2ForSequenceClassification
        model = LayoutLMv2ForSequenceClassification.from_pretrained(checkpoint)

    return processor, model
def extract_text_with_tesseract(image):
    """Extract words and their bounding boxes from an image using Tesseract OCR.

    Args:
        image: A PIL image or a NumPy array. Either is converted to an RGB
            PIL image before being handed to pytesseract.

    Returns:
        Tuple ``(words, word_boxes)`` of two equal-length lists. ``words``
        contains every text entry reported by ``image_to_data`` that is not
        empty/whitespace; ``word_boxes`` contains the matching
        ``[left, top, left + width, top + height]`` box for each word, in the
        coordinates reported by Tesseract.

    Raises:
        RuntimeError: If the module-level ``TESSERACT_AVAILABLE`` flag is false.
    """
    if not TESSERACT_AVAILABLE:
        raise RuntimeError("tesseract is not installed or it's not in your PATH. See README file for more information.")

    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    pil_image = image.convert("RGB")

    # One image_to_data call provides both the text and its geometry, so no
    # separate image_to_string pass (a second full OCR run) is needed.
    data = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DICT)

    words = []
    word_boxes = []
    for text, left, top, width, height in zip(
        data['text'], data['left'], data['top'], data['width'], data['height']
    ):
        # Skip entries whose text is empty or only whitespace.
        if not text.strip():
            continue
        words.append(text)
        word_boxes.append([left, top, left + width, top + height])

    return words, word_boxes
# Lazily-initialized TrOCR processor/model shared by every call to
# extract_text_with_transformers(), so the checkpoint is loaded only once.
_trocr_processor = None
_trocr_model = None


def _get_trocr_models():
    """Return the cached TrOCR processor and model, loading each on first use."""
    global _trocr_processor, _trocr_model
    from transformers import TrOCRProcessor, VisionEncoderDecoderModel

    if _trocr_processor is None:
        _trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
    if _trocr_model is None:
        _trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
    return _trocr_processor, _trocr_model


def extract_text_with_transformers(image):
    """Extract text using transformers models when Tesseract is not available.

    Runs the "microsoft/trocr-base-printed" checkpoint on the image and splits
    the decoded string on whitespace. The processor and model are cached after
    the first successful load instead of being re-created on every call.

    Args:
        image: A PIL image or a NumPy array. Either is converted to an RGB
            PIL image before processing.

    Returns:
        Tuple ``(words, word_boxes)``. No layout information is produced by
        this path, so every box is the placeholder ``[0, 0, 0, 0]``.

        This function is best-effort and does not raise: if anything fails
        (including ``transformers`` not being importable), it returns the
        two-element word list
        ``["Error extracting text with transformers OCR:", <message>]`` with
        two placeholder boxes. Callers that need to detect failure must
        inspect the returned words.
    """
    try:
        trocr_processor, trocr_model = _get_trocr_models()

        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        pil_image = image.convert("RGB")

        pixel_values = trocr_processor(pil_image, return_tensors="pt").pixel_values
        generated_ids = trocr_model.generate(pixel_values)
        generated_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        words = generated_text.split()
        # No bounding boxes are available from this path; use placeholders.
        word_boxes = [[0, 0, 0, 0] for _ in words]
        return words, word_boxes
    except Exception as e:
        # Deliberate catch-all: report the failure as text rather than raising.
        return ["Error extracting text with transformers OCR:", str(e)], [[0, 0, 0, 0], [0, 0, 0, 0]]
def extract_text_and_layout(image):
    """
    Run OCR on an image and package the words and boxes into a result dict.

    Tesseract is used when the module-level TESSERACT_AVAILABLE flag is true;
    otherwise the transformers-based extractor is used. Only one backend is
    tried per call.

    Args:
        image: PIL Image object, or a NumPy array (converted to an RGB PIL
            image before OCR).

    Returns:
        Dictionary with keys 'words', 'boxes' and 'success':
        - OCR raised an exception: 'words' holds a single error message,
          'boxes' a single placeholder box, and 'success' is False.
        - OCR returned no words: empty 'words' and 'boxes', 'success' False.
        - Otherwise: the extracted words and boxes with 'success' True.

        NOTE: extract_text_with_transformers reports its own failures as
        returned words instead of raising, so those are not seen as
        exceptions here.
    """
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image).convert("RGB")

    try:
        # Pick the OCR backend based on Tesseract availability.
        run_ocr = extract_text_with_tesseract if TESSERACT_AVAILABLE else extract_text_with_transformers
        words, boxes = run_ocr(image)
    except Exception as e:
        # The selected backend failed; surface the error in the result.
        return {
            'words': [f"Error extracting text: {str(e)}"],
            'boxes': [[0, 0, 0, 0]],
            'success': False
        }

    if words:
        return {'words': words, 'boxes': boxes, 'success': True}

    # Nothing was recognized.
    return {'words': [], 'boxes': [], 'success': False}
|