"""Gradio demo: OCR a document image with EasyOCR and encode it with LayoutLMv3.

The script loads an EasyOCR reader and the ``microsoft/layoutlmv3-base``
processor/model at import time, builds a small Gradio Blocks UI, and launches
it.
"""

import json

import easyocr
import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

MODEL_NAME = "microsoft/layoutlmv3-base"

# Bounding boxes handed to the processor are scaled to a 0..BOX_SCALE grid
# relative to the image width/height.
BOX_SCALE = 1000

# Kept at column 0 so the Markdown is never affected by source indentation.
INSTRUCTIONS_MD = """
## 📌 Instructions
1. Upload a document image.
2. Click "Process Document".
3. See the text extracted and model output.
"""

# Load EasyOCR Reader
reader = easyocr.Reader(['en'])


# Load LayoutLMv3 model and processor
def load_model():
    """Load the LayoutLMv3 processor and base model.

    The processor is created with ``apply_ocr=False`` because this script
    supplies its own words and boxes (from EasyOCR); with the built-in OCR
    enabled the processor refuses caller-supplied boxes.

    Returns:
        A ``(processor, model)`` tuple.
    """
    processor = AutoProcessor.from_pretrained(MODEL_NAME, apply_ocr=False)
    model = AutoModel.from_pretrained(MODEL_NAME)
    return processor, model


processor, model = load_model()


def _scale_coordinate(value, extent):
    """Map a pixel coordinate to the 0..BOX_SCALE grid, clamped to that range."""
    scaled = int(BOX_SCALE * int(value) / extent)
    return max(0, min(BOX_SCALE, scaled))


def _extract_words_and_boxes(ocr_results, width, height):
    """Turn EasyOCR results into parallel lists of texts and normalised boxes.

    Args:
        ocr_results: Iterable of ``(bbox, text, confidence)`` triples, where
            ``bbox`` is a sequence of ``(x, y)`` corner points.
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        ``(words, boxes)`` where each box is ``[x0, y0, x1, y1]`` (top-left,
        bottom-right) on the 0..BOX_SCALE grid. Entries whose text is blank
        are skipped.
    """
    words = []
    boxes = []
    for bbox, text, _confidence in ocr_results:
        if not text.strip():
            continue
        x_coords = [point[0] for point in bbox]
        y_coords = [point[1] for point in bbox]
        words.append(text)
        # Collapse the four-point quad to its axis-aligned bounding rectangle.
        boxes.append([
            _scale_coordinate(min(x_coords), width),
            _scale_coordinate(min(y_coords), height),
            _scale_coordinate(max(x_coords), width),
            _scale_coordinate(max(y_coords), height),
        ])
    return words, boxes


# OCR + Preprocessing for LayoutLMv3
def process_document(image):
    """Run EasyOCR on ``image`` and encode the result with LayoutLMv3.

    Args:
        image: A ``PIL.Image.Image``. Anything else is rejected.

    Returns:
        A ``(image, text)`` tuple for the two Gradio outputs. ``image`` is the
        input image (or ``None`` when the input is not a PIL image). ``text``
        is a JSON report on success, ``"No text detected."`` when OCR finds
        nothing usable, or an error message if processing raises.
    """
    if not isinstance(image, Image.Image):
        return None, "Error: Invalid image format. Please upload a valid image."

    try:
        # Work on an RGB copy so palette/alpha/greyscale inputs reach OCR and
        # the processor as a plain 3-channel image.
        rgb_image = image.convert("RGB")

        # OCR: Get text and boxes from EasyOCR
        ocr_results = reader.readtext(np.array(rgb_image))
        if not ocr_results:
            return image, "No text detected."

        width, height = rgb_image.size
        words, normalized_boxes = _extract_words_and_boxes(
            ocr_results, width, height
        )
        if not words:
            # Every OCR fragment was blank; there is nothing to encode.
            return image, "No text detected."

        # Encode inputs for LayoutLMv3. The words go in as the second
        # positional argument, the form used in the LayoutLMv3 docs.
        encoding = processor(
            rgb_image,
            words,
            boxes=normalized_boxes,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )

        with torch.no_grad():
            outputs = model(**encoding)

        # The base model has no task head, so report the hidden-state shape.
        hidden = outputs.last_hidden_state

        result = {
            "status": "success",
            "words": words,
            "model_output_shape": str(hidden.shape),
            "message": "Document processed with EasyOCR and LayoutLMv3.",
        }
        # ensure_ascii=False keeps non-ASCII OCR text readable in the textbox.
        return image, json.dumps(result, indent=2, ensure_ascii=False)
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing.
        return image, f"Error processing document: {str(e)}"


# Gradio UI
with gr.Blocks(title="LayoutLMv3 with EasyOCR") as demo:
    gr.Markdown("# 🧾 Document Layout Analysis with LayoutLMv3 + EasyOCR")
    gr.Markdown(
        "Upload a document image (PNG, JPG, JPEG). "
        "We’ll extract the layout and text using EasyOCR."
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="📄 Upload Document Image")
            submit_button = gr.Button("🔍 Process Document")
        with gr.Column():
            image_output = gr.Image(label="📷 Uploaded Image")
            text_output = gr.Textbox(label="📊 Analysis Results", lines=20)

    submit_button.click(
        fn=process_document,
        inputs=image_input,
        outputs=[image_output, text_output],
    )

    gr.Markdown(INSTRUCTIONS_MD)

# Launch
demo.launch()