Spaces:

darkbat
/

LayoutLMv3-Document-Analyzer

Running

App Files Files Community

darkbat committed on Jun 29

Commit

1938ce7

verified ·

1 Parent(s): 00491e8

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -34

app.py CHANGED Viewed

@@ -3,57 +3,95 @@ from transformers import AutoModel, AutoProcessor
 from PIL import Image
 import torch
 import json
-# Load the LayoutLMv3 model and processor
 def load_model():
-    processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=True)
     model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")
     return processor, model
 processor, model = load_model()
-# Function to process the uploaded image
 def process_document(image):
     try:
-        # Ensure image is a PIL Image (Gradio provides it as PIL with type="pil")
         if not isinstance(image, Image.Image):
             return None, "Error: Invalid image format. Please upload a valid image."
-        # Preprocess the image with the processor
-        encoding = processor(image, return_tensors="pt")
-        # Run the model
         with torch.no_grad():
             outputs = model(**encoding)
-        # Extract logits or embeddings (modify based on your task)
-        logits = outputs.logits if hasattr(outputs, 'logits') else outputs.last_hidden_state
-        # Placeholder result; customize based on your task (e.g., token classification, text extraction)
         result = {
             "status": "success",
-            "model_output_shape": str(logits.shape),
-            "message": "Document processed successfully. Customize this section for specific outputs."
         }
         return image, json.dumps(result, indent=2)
     except Exception as e:
         return image, f"Error processing document: {str(e)}"
-# Gradio Interface
-with gr.Blocks(title="Document Analysis with LayoutLMv3") as demo:
-    gr.Markdown("# Document Analysis with LayoutLMv3")
-    gr.Markdown("Upload a document image (PNG, JPG, JPEG) to analyze its layout and extract text.")
     with gr.Row():
         with gr.Column():
-            image_input = gr.Image(type="pil", label="Upload Document Image")
-            submit_button = gr.Button("Process Document")
         with gr.Column():
-            image_output = gr.Image(label="Uploaded Image")
-            text_output = gr.Textbox(label="Analysis Results")
     submit_button.click(
         fn=process_document,
         inputs=image_input,
@@ -61,12 +99,11 @@ with gr.Blocks(title="Document Analysis with LayoutLMv3") as demo:
     )
     gr.Markdown("""
-    ### Instructions
-    1. Upload a document image (PNG, JPG, or JPEG).
-    2. Click "Process Document" to analyze the image.
-    3. View the results in the output section.
-    4. This is a basic demo; customize the output processing for specific tasks (e.g., text extraction, layout analysis).
     """)
-# Launch the Gradio app
 demo.launch()

 from PIL import Image
 import torch
 import json
+import easyocr
+import numpy as np
+# Load EasyOCR Reader
+reader = easyocr.Reader(['en'])
+# Load LayoutLMv3 model and processor
 def load_model():
     """Load the LayoutLMv3 processor and base model.

     Words and boxes are supplied by EasyOCR in ``process_document``, so the
     processor's built-in OCR is switched off: LayoutLMv3Processor raises a
     ValueError when boxes are passed while ``apply_ocr`` is True.

     Returns:
         A ``(processor, model)`` tuple.
     """
     processor = AutoProcessor.from_pretrained(
         "microsoft/layoutlmv3-base", apply_ocr=False
     )
     model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")
     return processor, model


 processor, model = load_model()


 def _normalize_box(bbox, width, height):
     """Turn an OCR polygon into a clamped [x0, y0, x1, y1] box on a 0-1000 scale."""
     x_coords = [point[0] for point in bbox]
     y_coords = [point[1] for point in bbox]

     def scale(value, size):
         # Clamp in case the OCR polygon extends past the image edge; the
         # model expects box coordinates normalized to 0-1000.
         return max(0, min(1000, int(1000 * int(value) / size)))

     return [
         scale(min(x_coords), width),
         scale(min(y_coords), height),
         scale(max(x_coords), width),
         scale(max(y_coords), height),
     ]


 # OCR + preprocessing for LayoutLMv3
 def process_document(image):
     """Run EasyOCR on a document image and encode the result with LayoutLMv3.

     Args:
         image: The uploaded document; must be a PIL image.

     Returns:
         A ``(image, text)`` tuple. ``image`` is the input image (``None``
         when the input is not a PIL image); ``text`` is a JSON summary on
         success, or a human-readable message otherwise.
     """
     try:
         if not isinstance(image, Image.Image):
             return None, "Error: Invalid image format. Please upload a valid image."
         # Use an RGB copy so RGBA / palette / grayscale uploads reach both
         # the OCR engine and the processor as 3-channel images.
         rgb_image = image.convert("RGB")
         width, height = rgb_image.size
         # OCR: each result is (polygon, text, confidence)
         ocr_results = reader.readtext(np.array(rgb_image))
         words = []
         normalized_boxes = []
         for bbox, text, _confidence in ocr_results:
             if not text.strip():
                 continue
             words.append(text)
             normalized_boxes.append(_normalize_box(bbox, width, height))
         # Checked after filtering: whitespace-only hits must not reach the
         # processor as an empty word list.
         if not words:
             return image, "No text detected."
         # The processor's keyword for pre-tokenized words is ``text``.
         encoding = processor(
             rgb_image,
             text=words,
             boxes=normalized_boxes,
             return_tensors="pt",
             truncation=True,
             padding="max_length",
         )
         with torch.no_grad():
             outputs = model(**encoding)
         hidden = outputs.last_hidden_state
         result = {
             "status": "success",
             "words": words,
             "model_output_shape": str(hidden.shape),
             "message": "Document processed with EasyOCR and LayoutLMv3.",
         }
         return image, json.dumps(result, indent=2)
     except Exception as e:
         # UI boundary: report the failure in the results box instead of crashing.
         return image, f"Error processing document: {str(e)}"
+# Gradio UI
+with gr.Blocks(title="LayoutLMv3 with EasyOCR") as demo:
+    gr.Markdown("# 🧾 Document Layout Analysis with LayoutLMv3 + EasyOCR")
+    gr.Markdown("Upload a document image (PNG, JPG, JPEG). We’ll extract the layout and text using EasyOCR.")
     with gr.Row():
         with gr.Column():
+            image_input = gr.Image(type="pil", label="📄 Upload Document Image")
+            submit_button = gr.Button("🔍 Process Document")
         with gr.Column():
+            image_output = gr.Image(label="📷 Uploaded Image")
+            text_output = gr.Textbox(label="📊 Analysis Results", lines=20)
     submit_button.click(
         fn=process_document,
         inputs=image_input,
     )
     gr.Markdown("""
+    ## 📌 Instructions
+    1. Upload a document image.
+    2. Click "Process Document".
+    3. See the text extracted and model output.
     """)
+# Launch
 demo.launch()