|
import gradio as gr |
|
from transformers import AutoModel, AutoProcessor |
|
from PIL import Image |
|
import torch |
|
import json |
|
import easyocr |
|
import numpy as np |
|
|
|
|
|
# Shared English-language EasyOCR reader, built once when the module loads
# so that each call to process_document reuses the same instance.
reader = easyocr.Reader(["en"])
|
|
|
|
|
def load_model(model_name="microsoft/layoutlmv3-base"):
    """Load the LayoutLMv3 processor and base model.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
            Defaults to the checkpoint this app has always used.

    Returns:
        A ``(processor, model)`` tuple.
    """
    # This app supplies its own words and boxes (from EasyOCR), so the
    # processor's built-in OCR must be disabled: with the default
    # apply_ocr=True the processor rejects caller-provided boxes.
    processor = AutoProcessor.from_pretrained(model_name, apply_ocr=False)
    model = AutoModel.from_pretrained(model_name)
    return processor, model
|
|
|
# Module-level processor/model pair, loaded once and read by process_document.
processor, model = load_model()
|
|
|
|
|
def _bbox_to_normalized_box(bbox, width, height):
    """Turn an OCR quadrilateral into an [x0, y0, x1, y1] box on a 0-1000 scale."""
    xs = [point[0] for point in bbox]
    ys = [point[1] for point in bbox]

    def scale(value, size):
        # Clamp to 0-1000: a corner that lies marginally outside the image
        # would otherwise yield a coordinate outside the normalized range.
        return max(0, min(1000, int(1000 * int(value) / size)))

    return [
        scale(min(xs), width),
        scale(min(ys), height),
        scale(max(xs), width),
        scale(max(ys), height),
    ]


def process_document(image):
    """Run OCR on an uploaded image and pass the words and boxes to the model.

    Args:
        image: The uploaded document as a ``PIL.Image.Image``.

    Returns:
        A ``(image, text)`` pair for the two UI outputs. ``text`` is a JSON
        summary on success, ``"No text detected."`` when OCR yields no
        non-blank text, or an error message. ``image`` is the input image,
        or ``None`` when the input is not a PIL image.
    """
    if not isinstance(image, Image.Image):
        return None, "Error: Invalid image format. Please upload a valid image."

    try:
        # Uploads may be RGBA, greyscale or palette images; work on an RGB
        # copy so OCR and the processor both see a 3-channel image.
        rgb_image = image.convert("RGB")
        width, height = rgb_image.size

        ocr_results = reader.readtext(np.array(rgb_image))

        words = []
        normalized_boxes = []
        for bbox, text, _confidence in ocr_results:
            if not text.strip():
                continue
            words.append(text)
            normalized_boxes.append(_bbox_to_normalized_box(bbox, width, height))

        # Covers both "OCR found nothing" and "every hit was blank"; the
        # processor must not be called with empty word/box lists.
        if not words:
            return image, "No text detected."

        # The words go in as the second positional argument, paired with
        # the explicit boxes.
        encoding = processor(
            rgb_image,
            words,
            boxes=normalized_boxes,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        with torch.no_grad():
            outputs = model(**encoding)

        hidden = outputs.last_hidden_state
        result = {
            "status": "success",
            "words": words,
            "model_output_shape": str(hidden.shape),
            "message": "Document processed with EasyOCR and LayoutLMv3.",
        }
        return image, json.dumps(result, indent=2)

    except Exception as e:
        # UI boundary: report the failure in the results box instead of
        # letting the exception propagate to the web framework.
        return image, f"Error processing document: {e}"
|
|
|
|
|
# Two-column layout: upload and trigger on the left, echoed image and the
# textual analysis on the right.
with gr.Blocks(title="LayoutLMv3 with EasyOCR") as demo:
    gr.Markdown("# 🧾 Document Layout Analysis with LayoutLMv3 + EasyOCR")
    gr.Markdown(
        "Upload a document image (PNG, JPG, JPEG). "
        "We’ll extract the layout and text using EasyOCR."
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="📄 Upload Document Image")
            submit_button = gr.Button("🔍 Process Document")
        with gr.Column():
            image_output = gr.Image(label="📷 Uploaded Image")
            text_output = gr.Textbox(label="📝 Analysis Results", lines=20)

    # process_document returns (image, text), matching the two outputs below.
    submit_button.click(
        fn=process_document,
        inputs=image_input,
        outputs=[image_output, text_output],
    )

    gr.Markdown("""
    ## 📋 Instructions
    1. Upload a document image.
    2. Click "Process Document".
    3. See the text extracted and model output.
    """)


# Start the server only when run as a script, so importing this module
# (e.g. to reuse `demo`) does not launch it as a side effect.
if __name__ == "__main__":
    demo.launch()