File size: 3,619 Bytes
f3c4f99
 
 
 
 
1938ce7
 
f3c4f99
1938ce7
 
 
 
f3c4f99
1938ce7
f3c4f99
 
 
 
 
1938ce7
f3c4f99
 
 
 
1938ce7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3c4f99
 
1938ce7
 
 
f3c4f99
 
1938ce7
 
 
f3c4f99
1938ce7
f3c4f99
1938ce7
f3c4f99
 
 
1938ce7
 
 
 
 
f3c4f99
 
1938ce7
 
f3c4f99
1938ce7
 
 
f3c4f99
 
 
 
 
 
 
1938ce7
 
 
 
f3c4f99
 
1938ce7
f3c4f99
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import gradio as gr
from transformers import AutoModel, AutoProcessor
from PIL import Image
import torch
import json
import easyocr
import numpy as np

# Module-level EasyOCR reader configured for English ('en').
# It is constructed once at import time and reused by process_document for
# every request, instead of being rebuilt per call.
reader = easyocr.Reader(['en'])

# Load LayoutLMv3 model and processor
def load_model():
    """Load the LayoutLMv3 base processor and model from the Hugging Face hub.

    The processor is created with ``apply_ocr=False`` because this app runs
    its own OCR (EasyOCR) and hands the resulting words and boxes to the
    processor. With the default ``apply_ocr=True`` the processor runs its
    built-in OCR and raises ``ValueError`` when the caller also supplies
    bounding boxes.

    Returns:
        A ``(processor, model)`` tuple for ``microsoft/layoutlmv3-base``.
    """
    processor = AutoProcessor.from_pretrained(
        "microsoft/layoutlmv3-base",
        apply_ocr=False,
    )
    model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")
    return processor, model


# Loaded once at import time so every request reuses the same objects.
processor, model = load_model()

# OCR + Preprocessing for LayoutLMv3
def _to_layoutlm_box(bbox, width, height):
    """Convert an EasyOCR quadrilateral to a clamped 0-1000 [x0, y0, x1, y1] box."""
    x_coords = [point[0] for point in bbox]
    y_coords = [point[1] for point in bbox]

    def scale(value, extent):
        # OCR corners can fall slightly outside the image; LayoutLMv3 only
        # accepts coordinates in the 0-1000 range, so clamp after scaling.
        return max(0, min(1000, int(1000 * value / extent)))

    # Axis-aligned bounding rectangle: top-left and bottom-right corners.
    return [
        scale(int(min(x_coords)), width),
        scale(int(min(y_coords)), height),
        scale(int(max(x_coords)), width),
        scale(int(max(y_coords)), height),
    ]


def process_document(image):
    """Run EasyOCR on an uploaded image and encode the result with LayoutLMv3.

    Args:
        image: The uploaded document as a ``PIL.Image.Image``.

    Returns:
        An ``(image, text)`` tuple matching the two Gradio outputs. The image
        slot is ``None`` when the input is not a PIL image and the original
        upload otherwise. ``text`` is a JSON summary (status, recognised
        words, shape of the model's last hidden state) on success, or a
        plain-text message when no text is found or an error occurs.
    """
    try:
        if not isinstance(image, Image.Image):
            return None, "Error: Invalid image format. Please upload a valid image."

        # Uploads may be RGBA, greyscale or palettised; the LayoutLMv3 image
        # processor expects 3-channel input, so normalise the mode once.
        rgb_image = image.convert("RGB")

        # OCR: each result is (quadrilateral, text, confidence).
        ocr_results = reader.readtext(np.array(rgb_image))

        width, height = rgb_image.size
        words = []
        normalized_boxes = []
        for bbox, text, _confidence in ocr_results:
            if not text.strip():
                continue
            words.append(text)
            normalized_boxes.append(_to_layoutlm_box(bbox, width, height))

        # Covers both "OCR found nothing" and "OCR found only blank strings".
        if not words:
            return image, "No text detected."

        # The processor takes the word list through its `text` parameter;
        # it has no `words` parameter.
        encoding = processor(
            rgb_image,
            text=words,
            boxes=normalized_boxes,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        with torch.no_grad():
            outputs = model(**encoding)

        hidden = outputs.last_hidden_state
        result = {
            "status": "success",
            "words": words,
            "model_output_shape": str(hidden.shape),
            "message": "Document processed with EasyOCR and LayoutLMv3."
        }

        return image, json.dumps(result, indent=2)

    except Exception as e:
        # UI boundary: surface any failure in the results box rather than
        # crashing the Gradio callback.
        return image, f"Error processing document: {str(e)}"

# Gradio UI
# Two-column layout: upload + button on the left, echoed image and results on
# the right. User-facing labels had mis-decoded (mojibake) emoji and
# apostrophes; they are restored to the intended characters here.
with gr.Blocks(title="LayoutLMv3 with EasyOCR") as demo:
    gr.Markdown("# 🧾 Document Layout Analysis with LayoutLMv3 + EasyOCR")
    gr.Markdown("Upload a document image (PNG, JPG, JPEG). We’ll extract the layout and text using EasyOCR.")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="📄 Upload Document Image")
            submit_button = gr.Button("🔍 Process Document")
        with gr.Column():
            image_output = gr.Image(label="📷 Uploaded Image")
            text_output = gr.Textbox(label="📊 Analysis Results", lines=20)

    # process_document returns (image, text), matching the two outputs below.
    submit_button.click(
        fn=process_document,
        inputs=image_input,
        outputs=[image_output, text_output]
    )

    gr.Markdown("""
    ## 📌 Instructions
    1. Upload a document image.
    2. Click "Process Document".
    3. See the text extracted and model output.
    """)

# Launch
demo.launch()