# Spaces: darkbat / LayoutLMv3-Document-Analyzer
# Status: Running
# File size: 3,619 Bytes

import gradio as gr
from transformers import AutoModel, AutoProcessor
from PIL import Image
import torch
import json
import easyocr
import numpy as np

# Load EasyOCR Reader
# Built once at module import with the language list ['en']; process_document
# reuses this single instance (via reader.readtext) for every request instead
# of constructing a new reader per call.
reader = easyocr.Reader(['en'])

# Load LayoutLMv3 model and processor
def load_model():
    """Load the LayoutLMv3 base checkpoint together with its processor.

    The processor is created with ``apply_ocr=False`` because this app
    supplies its own words and boxes from EasyOCR. With the built-in OCR
    left enabled, ``LayoutLMv3Processor`` raises ``ValueError`` as soon as
    caller-provided ``boxes`` are passed to it.

    Returns:
        tuple: ``(processor, model)``.
    """
    checkpoint = "microsoft/layoutlmv3-base"
    processor = AutoProcessor.from_pretrained(checkpoint, apply_ocr=False)
    model = AutoModel.from_pretrained(checkpoint)
    return processor, model

processor, model = load_model()


def _to_layout_coordinate(pixel_value, extent):
    """Scale a pixel coordinate onto the 0-1000 grid, clamped to that range."""
    scaled = int(1000 * int(pixel_value) / extent)
    # OCR boxes can spill slightly outside the image; LayoutLMv3 only accepts
    # coordinates within [0, 1000].
    return max(0, min(1000, scaled))


def _words_and_boxes(ocr_results, width, height):
    """Convert EasyOCR detections into parallel text and normalized-box lists.

    Each detection is a ``(bbox, text, confidence)`` triple where ``bbox`` is
    a sequence of ``(x, y)`` corner points. Whitespace-only texts are skipped.
    Boxes are returned as ``[x0, y0, x1, y1]`` (top-left, bottom-right).
    """
    words = []
    boxes = []
    for bbox, text, _confidence in ocr_results:
        if not text.strip():
            continue
        xs = [point[0] for point in bbox]
        ys = [point[1] for point in bbox]
        words.append(text)
        boxes.append([
            _to_layout_coordinate(min(xs), width),
            _to_layout_coordinate(min(ys), height),
            _to_layout_coordinate(max(xs), width),
            _to_layout_coordinate(max(ys), height),
        ])
    return words, boxes


# OCR + Preprocessing for LayoutLMv3
def process_document(image):
    """Run EasyOCR on an image and feed the result through LayoutLMv3.

    Args:
        image: The uploaded document; must be a ``PIL.Image.Image``.

    Returns:
        tuple: ``(image, message)``. ``image`` is the input echoed back
        (``None`` when the input is not a PIL image). ``message`` is a JSON
        string with the recognised texts and the shape of the model's last
        hidden state on success, ``"No text detected."`` when OCR yields no
        usable text, or a human-readable error string otherwise.
    """
    try:
        if not isinstance(image, Image.Image):
            return None, "Error: Invalid image format. Please upload a valid image."

        # Normalise the mode so OCR and the model both see plain RGB pixels
        # (not palette indices, a single channel, or an extra alpha channel).
        rgb_image = image.convert("RGB")

        # OCR: Get text and boxes from EasyOCR
        ocr_results = reader.readtext(np.array(rgb_image))

        width, height = image.size
        words, normalized_boxes = _words_and_boxes(ocr_results, width, height)

        # Covers both "nothing detected" and "only blank strings detected";
        # an empty word list must not reach the processor.
        if not words:
            return image, "No text detected."

        # Encode inputs for LayoutLMv3. The processor's parameter for the
        # word list is `text`; it has no `words` parameter.
        encoding = processor(
            rgb_image,
            text=words,
            boxes=normalized_boxes,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        with torch.no_grad():
            outputs = model(**encoding)

        # The base model has no task head, so report the encoder output shape.
        hidden = outputs.last_hidden_state
        result = {
            "status": "success",
            "words": words,
            "model_output_shape": str(hidden.shape),
            "message": "Document processed with EasyOCR and LayoutLMv3."
        }

        return image, json.dumps(result, indent=2)

    except Exception as e:
        # UI boundary: surface the failure in the results box rather than
        # letting the request crash.
        return image, f"Error processing document: {str(e)}"

# Gradio UI
# One row with two columns: the first holds the upload widget and the submit
# button, the second holds the echoed image and the results text box.
# Components are created in the order they should appear, so the statement
# order inside each `with` block is significant.
with gr.Blocks(title="LayoutLMv3 with EasyOCR") as demo:
    gr.Markdown("# 🧾 Document Layout Analysis with LayoutLMv3 + EasyOCR")
    gr.Markdown("Upload a document image (PNG, JPG, JPEG). We’ll extract the layout and text using EasyOCR.")

    with gr.Row():
        with gr.Column():
            # type="pil" requests the upload as a PIL image, which is the type
            # process_document's isinstance check accepts.
            image_input = gr.Image(type="pil", label="📄 Upload Document Image")
            submit_button = gr.Button("🔍 Process Document")
        with gr.Column():
            image_output = gr.Image(label="📷 Uploaded Image")
            text_output = gr.Textbox(label="📊 Analysis Results", lines=20)

    # process_document returns a (image, message) pair; the two values are
    # routed to image_output and text_output in that order.
    submit_button.click(
        fn=process_document,
        inputs=image_input,
        outputs=[image_output, text_output]
    )

    gr.Markdown("""
    ## 📌 Instructions
    1. Upload a document image.
    2. Click "Process Document".
    3. See the text extracted and model output.
    """)

# Launch
# Called unconditionally at module level (no __main__ guard), so the app
# starts whenever this file is executed or imported.
demo.launch()