import gradio as gr
from transformers import AutoModel, AutoProcessor
from PIL import Image
import torch
import json
import easyocr
import numpy as np
# Load EasyOCR Reader
reader = easyocr.Reader(['en'])
# Load LayoutLMv3 model and processor
def load_model():
    # apply_ocr=False: we supply words and boxes from EasyOCR ourselves,
    # so the processor must not run its own built-in OCR (which would
    # conflict with externally provided boxes and raise an error).
    processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
    model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")
    model.eval()
    return processor, model

processor, model = load_model()
# OCR + Preprocessing for LayoutLMv3
def process_document(image):
    try:
        if not isinstance(image, Image.Image):
            return None, "Error: Invalid image format. Please upload a valid image."

        # OCR: get text and corner-point boxes from EasyOCR
        # (convert to RGB first so RGBA PNGs don't trip up the reader)
        ocr_results = reader.readtext(np.array(image.convert("RGB")))
        if not ocr_results:
            return image, "No text detected."

        words = []
        boxes = []
        for bbox, text, confidence in ocr_results:
            if text.strip() == "":
                continue
            words.append(text)
            # Collapse the 4-corner polygon to [x0, y0, x1, y1] (top-left, bottom-right)
            x_coords = [point[0] for point in bbox]
            y_coords = [point[1] for point in bbox]
            boxes.append([
                int(min(x_coords)), int(min(y_coords)),
                int(max(x_coords)), int(max(y_coords)),
            ])

        # Normalize boxes to the 0-1000 scale LayoutLMv3 expects, clamping so
        # detections touching the image edge cannot fall outside [0, 1000]
        width, height = image.size
        normalized_boxes = []
        for x0, y0, x1, y1 in boxes:
            normalized_boxes.append([
                max(0, min(1000, int(1000 * x0 / width))),
                max(0, min(1000, int(1000 * y0 / height))),
                max(0, min(1000, int(1000 * x1 / width))),
                max(0, min(1000, int(1000 * y1 / height))),
            ])

        # Encode inputs for LayoutLMv3; the processor pairs each word with its box
        encoding = processor(
            image,
            text=words,
            boxes=normalized_boxes,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        with torch.no_grad():
            outputs = model(**encoding)

        # The base model carries no task head, so it returns hidden states, not logits
        hidden = outputs.last_hidden_state

        result = {
            "status": "success",
            "words": words,
            "model_output_shape": str(hidden.shape),
            "message": "Document processed with EasyOCR and LayoutLMv3.",
        }
        return image, json.dumps(result, indent=2)
    except Exception as e:
        return image, f"Error processing document: {str(e)}"
# Gradio UI
with gr.Blocks(title="LayoutLMv3 with EasyOCR") as demo:
    gr.Markdown("# 🧾 Document Layout Analysis with LayoutLMv3 + EasyOCR")
    gr.Markdown("Upload a document image (PNG, JPG, JPEG). We'll extract the layout and text using EasyOCR.")
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Document Image")
            submit_button = gr.Button("Process Document")
        with gr.Column():
            image_output = gr.Image(label="Uploaded Image")
            text_output = gr.Textbox(label="Analysis Results", lines=20)
    submit_button.click(
        fn=process_document,
        inputs=image_input,
        outputs=[image_output, text_output]
    )
    gr.Markdown("""
    ## Instructions
    1. Upload a document image.
    2. Click "Process Document".
    3. See the extracted text and model output.
    """)
# Launch
demo.launch()
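
# Hedged usage sketch without the UI (left commented out so it never interferes
# with the blocking launch() above; the sample path is illustrative only):
# img = Image.open("sample_document.png").convert("RGB")
# _, report = process_document(img)
# print(report)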