# Hugging Face Space (status when captured: Sleeping)
import numpy as np
import pytesseract
import torch
from PIL import Image
from transformers import LayoutLMv2ForSequenceClassification, LayoutLMv2Processor
# Module-level cache for the LayoutLMv2 processor and model. Both start as
# None and are filled in by get_document_ai_models() the first time it runs.
processor = None
model = None
def get_document_ai_models():
    """Return the shared LayoutLMv2 processor and model, loading them lazily.

    The instances live in the module-level ``processor`` and ``model``
    globals. Each one is loaded with ``from_pretrained`` only while its global
    is still ``None``; once set, later calls return the stored object.

    Returns:
        A ``(processor, model)`` tuple.
    """
    global processor, model

    # Both objects are loaded from the same pretrained checkpoint.
    checkpoint = "microsoft/layoutlmv2-base-uncased"

    if processor is None:
        processor = LayoutLMv2Processor.from_pretrained(checkpoint)
    if model is None:
        model = LayoutLMv2ForSequenceClassification.from_pretrained(checkpoint)
    return processor, model
def extract_text_with_tesseract(image):
    """Run Tesseract OCR on an image and return its words with bounding boxes.

    Args:
        image: A numpy array or a PIL image. Either is converted to an RGB
            PIL image before OCR.

    Returns:
        A ``(words, word_boxes)`` tuple of parallel lists. ``words`` holds
        every text entry from the OCR result that is not blank, and
        ``word_boxes`` holds the matching ``[x, y, x + width, y + height]``
        box built from the reported left/top/width/height values.
    """
    if isinstance(image, np.ndarray):
        pil_image = Image.fromarray(image).convert("RGB")
    else:
        pil_image = image.convert("RGB")

    # image_to_data returns both the text and its geometry, so a separate
    # image_to_string pass (a second full OCR run) is not needed.
    ocr = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DICT)

    words = []
    word_boxes = []
    columns = zip(ocr['text'], ocr['left'], ocr['top'], ocr['width'], ocr['height'])
    for token, x, y, w, h in columns:
        # Skip entries whose text is empty or whitespace-only.
        if token.strip() == '':
            continue
        words.append(token)
        word_boxes.append([x, y, x + w, y + h])
    return words, word_boxes
def extract_text_and_layout(image):
    """Extract words and their bounding boxes from a document image via OCR.

    Only the Tesseract helper is used here; no LayoutLMv2 model is invoked.

    Args:
        image: A PIL image or a numpy array. Arrays are converted to an RGB
            PIL image before OCR.

    Returns:
        A dict with three keys:
            ``'words'``: list of recognised words.
            ``'boxes'``: the box for each word, in the same order.
            ``'success'``: ``True`` when at least one word was found. When it
            is ``False`` both lists are empty.
    """
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image).convert("RGB")

    words, boxes = extract_text_with_tesseract(image)

    # An empty OCR result is reported as a failure with empty lists.
    found = bool(words)
    return {
        'words': words if found else [],
        'boxes': boxes if found else [],
        'success': found,
    }