# darkbat's picture
# Update app.py
# 1938ce7 verified
import gradio as gr
from transformers import AutoModel, AutoProcessor
from PIL import Image
import torch
import json
import easyocr
import numpy as np
# Load EasyOCR Reader
# Created once at module import; process_document() reuses this single
# instance for every request instead of building a reader per call.
OCR_LANGUAGES = ["en"]
reader = easyocr.Reader(OCR_LANGUAGES)
# Load LayoutLMv3 model and processor
def load_model(model_name="microsoft/layoutlmv3-base"):
    """Load the LayoutLMv3 processor and base model.

    Args:
        model_name: Hugging Face model id (or local path) to load both the
            processor and the model from.

    Returns:
        A ``(processor, model)`` tuple.
    """
    # apply_ocr=False: words and boxes are supplied by EasyOCR in
    # process_document(). With the checkpoint's default setting the
    # processor runs its own OCR and raises if the caller also passes boxes.
    processor = AutoProcessor.from_pretrained(model_name, apply_ocr=False)
    model = AutoModel.from_pretrained(model_name)
    return processor, model


processor, model = load_model()


def _bbox_to_rect(bbox):
    """Collapse an OCR corner-point polygon into [x0, y0, x1, y1]."""
    x_coords = [point[0] for point in bbox]
    y_coords = [point[1] for point in bbox]
    return [
        int(min(x_coords)),
        int(min(y_coords)),
        int(max(x_coords)),
        int(max(y_coords)),
    ]


def _normalize_box(box, width, height):
    """Scale a pixel box to the 0-1000 grid and clamp it to that range."""
    x0, y0, x1, y1 = box
    scaled = (
        1000 * x0 / width,
        1000 * y0 / height,
        1000 * x1 / width,
        1000 * y1 / height,
    )
    # Clamp: a detection that extends past the image border would otherwise
    # yield a negative or >1000 coordinate, which is outside the range the
    # model's 2D position embeddings are documented to accept.
    return [max(0, min(1000, int(value))) for value in scaled]


# OCR + Preprocessing for LayoutLMv3
def process_document(image):
    """Run EasyOCR on an image and feed the words/boxes through LayoutLMv3.

    Args:
        image: The uploaded document as a ``PIL.Image.Image``.

    Returns:
        A ``(image, text)`` tuple for the two UI outputs. ``image`` is the
        input image (``None`` when the input was not a PIL image) and
        ``text`` is either a JSON summary of the run or a human-readable
        status/error message.
    """
    if not isinstance(image, Image.Image):
        return None, "Error: Invalid image format. Please upload a valid image."

    # Top-level UI boundary: any failure in OCR or the model is reported in
    # the results box instead of surfacing as an unhandled exception.
    try:
        # Work on an RGB copy so RGBA / grayscale / palette uploads reach the
        # image processor with three channels; the original image is what
        # gets returned to the UI.
        rgb_image = image.convert("RGB")
        width, height = rgb_image.size

        # OCR: Get text and boxes from EasyOCR
        ocr_results = reader.readtext(np.array(rgb_image))
        if not ocr_results:
            return image, "No text detected."

        words = []
        normalized_boxes = []
        for bbox, text, _confidence in ocr_results:
            if not text.strip():
                continue
            words.append(text)
            # Convert the polygon to [x0, y0, x1, y1] (top-left,
            # bottom-right), then to LayoutLMv3's 1000x1000 grid.
            normalized_boxes.append(
                _normalize_box(_bbox_to_rect(bbox), width, height)
            )

        # Every detection may have been whitespace-only; do not call the
        # processor with empty word/box lists.
        if not words:
            return image, "No text detected."

        # Encode inputs for LayoutLMv3. The words go in as the processor's
        # second positional argument (its ``text`` parameter); there is no
        # ``words`` keyword on the processor call.
        encoding = processor(
            rgb_image,
            words,
            boxes=normalized_boxes,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        with torch.no_grad():
            outputs = model(**encoding)

        # The base model has no task head, so report the encoder output.
        hidden = outputs.last_hidden_state
        result = {
            "status": "success",
            "words": words,
            "model_output_shape": str(hidden.shape),
            "message": "Document processed with EasyOCR and LayoutLMv3.",
        }
        # ensure_ascii=False keeps non-ASCII OCR text readable in the UI
        # instead of \uXXXX escapes.
        return image, json.dumps(result, indent=2, ensure_ascii=False)
    except Exception as e:
        return image, f"Error processing document: {str(e)}"
# Gradio UI
# Layout: upload + button in the left column, echoed image + results text in
# the right column. User-facing labels below had mis-decoded (mojibake)
# emoji/punctuation; they are restored to the intended UTF-8 characters.
with gr.Blocks(title="LayoutLMv3 with EasyOCR") as demo:
    gr.Markdown("# 🧾 Document Layout Analysis with LayoutLMv3 + EasyOCR")
    gr.Markdown("Upload a document image (PNG, JPG, JPEG). We’ll extract the layout and text using EasyOCR.")
    with gr.Row():
        with gr.Column():
            # type="pil" so process_document receives a PIL.Image.Image.
            image_input = gr.Image(type="pil", label="📄 Upload Document Image")
            submit_button = gr.Button("🔍 Process Document")
        with gr.Column():
            image_output = gr.Image(label="📷 Uploaded Image")
            text_output = gr.Textbox(label="📊 Analysis Results", lines=20)
    # process_document returns (image, text), matching the two outputs.
    submit_button.click(
        fn=process_document,
        inputs=image_input,
        outputs=[image_output, text_output]
    )
    gr.Markdown("""
## 📌 Instructions
1. Upload a document image.
2. Click "Process Document".
3. See the text extracted and model output.
""")
# Launch
demo.launch()