Spaces:

darkbat
/

LayoutLMv3-Document-Analyzer

Running

App Files Files Community

LayoutLMv3-Document-Analyzer / app.py

darkbat

Update app.py

1938ce7 verified about 2 months ago

raw

history blame contribute delete

3.62 kB

	import gradio as gr
	from transformers import AutoModel, AutoProcessor
	from PIL import Image
	import torch
	import json
	import easyocr
	import numpy as np

	# Load EasyOCR Reader
	# Shared English-only EasyOCR reader, created once at import time and
	# reused by every request.
	reader = easyocr.Reader(lang_list=["en"])

	# Load LayoutLMv3 model and processor
	def load_model(model_name="microsoft/layoutlmv3-base"):
	    """Load the LayoutLMv3 processor and encoder.

	    Args:
	        model_name: Hugging Face checkpoint to load. Defaults to the
	            LayoutLMv3 base checkpoint.

	    Returns:
	        A ``(processor, model)`` tuple.
	    """
	    # apply_ocr=False: words and boxes come from EasyOCR. With the
	    # processor's built-in OCR left on, LayoutLMv3Processor rejects
	    # caller-supplied boxes with a ValueError.
	    processor = AutoProcessor.from_pretrained(model_name, apply_ocr=False)
	    model = AutoModel.from_pretrained(model_name)
	    return processor, model


	processor, model = load_model()


	def _scale_coordinate(value, extent):
	    """Map a pixel coordinate onto the 0-1000 layout grid, clamped to range."""
	    scaled = int(1000 * int(value) / extent)
	    # OCR polygons can poke slightly outside the image; keep the result
	    # inside the range the layout embeddings are indexed with.
	    return max(0, min(1000, scaled))


	def _extract_words_and_boxes(ocr_results, width, height):
	    """Turn EasyOCR results into parallel lists of texts and normalised boxes.

	    Each OCR entry is ``(polygon, text, confidence)``. Entries whose text is
	    blank are dropped. Every polygon is reduced to its axis-aligned bounding
	    box ``[x0, y0, x1, y1]`` scaled to the 0-1000 grid.
	    """
	    words = []
	    boxes = []
	    for polygon, text, _confidence in ocr_results:
	        if not text.strip():
	            continue
	        xs = [point[0] for point in polygon]
	        ys = [point[1] for point in polygon]
	        words.append(text)
	        boxes.append([
	            _scale_coordinate(min(xs), width),
	            _scale_coordinate(min(ys), height),
	            _scale_coordinate(max(xs), width),
	            _scale_coordinate(max(ys), height),
	        ])
	    return words, boxes


	# OCR + Preprocessing for LayoutLMv3
	def process_document(image):
	    """Run EasyOCR and LayoutLMv3 on an uploaded document image.

	    Args:
	        image: The uploaded picture as a ``PIL.Image.Image``.

	    Returns:
	        A ``(image, text)`` tuple for the two UI outputs. ``text`` is a JSON
	        report on success, or a human-readable message when the input is
	        invalid, no text is found, or processing fails. ``image`` is ``None``
	        when the input was not a PIL image.
	    """
	    if not isinstance(image, Image.Image):
	        return None, "Error: Invalid image format. Please upload a valid image."

	    try:
	        # Uploads may be RGBA, palette or grayscale; work on an RGB copy
	        # and hand the untouched upload back to the UI.
	        rgb_image = image.convert("RGB")

	        # OCR: get text and polygons from EasyOCR
	        ocr_results = reader.readtext(np.array(rgb_image))
	        if not ocr_results:
	            return image, "No text detected."

	        width, height = rgb_image.size
	        words, normalized_boxes = _extract_words_and_boxes(
	            ocr_results, width, height
	        )
	        if not words:
	            # Every OCR hit was whitespace; nothing to encode.
	            return image, "No text detected."

	        # Encode inputs for LayoutLMv3. The OCR strings go in through the
	        # processor's `text` argument.
	        encoding = processor(
	            rgb_image,
	            text=words,
	            boxes=normalized_boxes,
	            return_tensors="pt",
	            truncation=True,
	            padding="max_length",
	        )

	        with torch.no_grad():
	            outputs = model(**encoding)

	        # The base encoder has no task head; report the hidden-state shape.
	        hidden = outputs.last_hidden_state
	        result = {
	            "status": "success",
	            "words": words,
	            "model_output_shape": str(hidden.shape),
	            "message": "Document processed with EasyOCR and LayoutLMv3."
	        }
	        return image, json.dumps(result, indent=2)

	    except Exception as e:
	        # UI boundary: show the failure in the results box instead of
	        # crashing the request.
	        return image, f"Error processing document: {str(e)}"

	# Gradio UI
	with gr.Blocks(title="LayoutLMv3 with EasyOCR") as demo:
	    # Page header
	    gr.Markdown(value="# 🧾 Document Layout Analysis with LayoutLMv3 + EasyOCR")
	    gr.Markdown(value="Upload a document image (PNG, JPG, JPEG). We’ll extract the layout and text using EasyOCR.")

	    # Two-column layout: upload and trigger on the left, results on the right
	    with gr.Row():
	        with gr.Column():
	            image_input = gr.Image(label="📄 Upload Document Image", type="pil")
	            submit_button = gr.Button(value="🔍 Process Document")
	        with gr.Column():
	            image_output = gr.Image(label="📷 Uploaded Image")
	            text_output = gr.Textbox(lines=20, label="📊 Analysis Results")

	    # Wire the button to the processing function
	    submit_button.click(
	        process_document,
	        inputs=[image_input],
	        outputs=[image_output, text_output],
	    )

	    gr.Markdown(value="""
	## 📌 Instructions
	1. Upload a document image.
	2. Click "Process Document".
	3. See the text extracted and model output.
	""")

	# Start the Gradio server
	demo.launch()