darkbat committed
Commit 1938ce7 · verified · 1 Parent(s): 00491e8

Update app.py

Files changed (1): app.py (+71 -34)
app.py CHANGED
@@ -3,57 +3,95 @@ from transformers import AutoModel, AutoProcessor
 from PIL import Image
 import torch
 import json
+import easyocr
+import numpy as np
 
-# Load the LayoutLMv3 model and processor
+# Load EasyOCR Reader
+reader = easyocr.Reader(['en'])
+
+# Load LayoutLMv3 model and processor
 def load_model():
-    processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=True)
+    processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base")
     model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")
     return processor, model
 
 processor, model = load_model()
 
-# Function to process the uploaded image
+# OCR + Preprocessing for LayoutLMv3
 def process_document(image):
     try:
-        # Ensure image is a PIL Image (Gradio provides it as PIL with type="pil")
         if not isinstance(image, Image.Image):
             return None, "Error: Invalid image format. Please upload a valid image."
-
-        # Preprocess the image with the processor
-        encoding = processor(image, return_tensors="pt")
-
-        # Run the model
+
+        # OCR: Get text and boxes from EasyOCR
+        ocr_results = reader.readtext(np.array(image))
+
+        if not ocr_results:
+            return image, "No text detected."
+
+        words = []
+        boxes = []
+        for (bbox, text, confidence) in ocr_results:
+            if text.strip() == "":
+                continue
+            words.append(text)
+            # Convert bounding box to [x0, y0, x1, y1] format (top-left, bottom-right)
+            x_coords = [point[0] for point in bbox]
+            y_coords = [point[1] for point in bbox]
+            x0, y0, x1, y1 = int(min(x_coords)), int(min(y_coords)), int(max(x_coords)), int(max(y_coords))
+            boxes.append([x0, y0, x1, y1])
+
+        # Normalize boxes to LayoutLMv3 expected format (1000x1000)
+        width, height = image.size
+        normalized_boxes = []
+        for box in boxes:
+            x0, y0, x1, y1 = box
+            normalized_box = [
+                int(1000 * x0 / width),
+                int(1000 * y0 / height),
+                int(1000 * x1 / width),
+                int(1000 * y1 / height)
+            ]
+            normalized_boxes.append(normalized_box)
+
+        # Encode inputs for LayoutLMv3
+        encoding = processor(image,
+                             words=words,
+                             boxes=normalized_boxes,
+                             return_tensors="pt",
+                             truncation=True,
+                             padding="max_length")
+
         with torch.no_grad():
             outputs = model(**encoding)
-
-        # Extract logits or embeddings (modify based on your task)
-        logits = outputs.logits if hasattr(outputs, 'logits') else outputs.last_hidden_state
-
-        # Placeholder result; customize based on your task (e.g., token classification, text extraction)
+
+        # Use last hidden state or logits based on model
+        hidden = outputs.last_hidden_state
         result = {
             "status": "success",
-            "model_output_shape": str(logits.shape),
-            "message": "Document processed successfully. Customize this section for specific outputs."
+            "words": words,
+            "model_output_shape": str(hidden.shape),
+            "message": "Document processed with EasyOCR and LayoutLMv3."
         }
-
+
         return image, json.dumps(result, indent=2)
-
+
     except Exception as e:
         return image, f"Error processing document: {str(e)}"
 
-# Gradio Interface
-with gr.Blocks(title="Document Analysis with LayoutLMv3") as demo:
-    gr.Markdown("# Document Analysis with LayoutLMv3")
-    gr.Markdown("Upload a document image (PNG, JPG, JPEG) to analyze its layout and extract text.")
-
+# Gradio UI
+with gr.Blocks(title="LayoutLMv3 with EasyOCR") as demo:
+    gr.Markdown("# 🧾 Document Layout Analysis with LayoutLMv3 + EasyOCR")
+    gr.Markdown("Upload a document image (PNG, JPG, JPEG). We'll extract the layout and text using EasyOCR.")
+
     with gr.Row():
         with gr.Column():
-            image_input = gr.Image(type="pil", label="Upload Document Image")
-            submit_button = gr.Button("Process Document")
+            image_input = gr.Image(type="pil", label="📄 Upload Document Image")
+            submit_button = gr.Button("🔍 Process Document")
         with gr.Column():
-            image_output = gr.Image(label="Uploaded Image")
-            text_output = gr.Textbox(label="Analysis Results")
-
+            image_output = gr.Image(label="📷 Uploaded Image")
+            text_output = gr.Textbox(label="📊 Analysis Results", lines=20)
+
     submit_button.click(
         fn=process_document,
         inputs=image_input,
@@ -61,12 +99,11 @@ with gr.Blocks(title="Document Analysis with LayoutLMv3") as demo:
     )
 
     gr.Markdown("""
-    ### Instructions
-    1. Upload a document image (PNG, JPG, or JPEG).
-    2. Click "Process Document" to analyze the image.
-    3. View the results in the output section.
-    4. This is a basic demo; customize the output processing for specific tasks (e.g., text extraction, layout analysis).
+    ## 📌 Instructions
+    1. Upload a document image.
+    2. Click "Process Document".
+    3. See the text extracted and model output.
     """)
 
-# Launch the Gradio app
+# Launch
 demo.launch()
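
For reference, a minimal standalone sketch (not part of the commit) of the EasyOCR + LayoutLMv3 preprocessing path this commit introduces, runnable outside the Gradio app. It assumes a local placeholder image "sample_doc.png" and loads the processor with apply_ocr=False so that the externally supplied words and boxes are the ones encoded:

# Standalone sketch of the OCR + encoding path added in this commit.
# Assumptions: a local file "sample_doc.png" (placeholder name) and a processor
# loaded with apply_ocr=False so the EasyOCR words/boxes are the ones encoded.
import easyocr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

reader = easyocr.Reader(['en'])
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")

image = Image.open("sample_doc.png").convert("RGB")
width, height = image.size

words, boxes = [], []
for bbox, text, _conf in reader.readtext(np.array(image)):
    if not text.strip():
        continue
    xs = [p[0] for p in bbox]
    ys = [p[1] for p in bbox]
    # LayoutLMv3 expects word boxes normalized to a 0-1000 coordinate space.
    boxes.append([
        int(1000 * min(xs) / width),
        int(1000 * min(ys) / height),
        int(1000 * max(xs) / width),
        int(1000 * max(ys) / height),
    ])
    words.append(text)

encoding = processor(image, words, boxes=boxes, return_tensors="pt", truncation=True)
with torch.no_grad():
    outputs = model(**encoding)
print(outputs.last_hidden_state.shape)  # (batch, sequence_length, hidden_size)

The printed shape is the base model's hidden states; a task-specific head (e.g., token classification) would sit on top of this output.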