rahul7star committed on
Commit
0752ecf
·
verified ·
1 Parent(s): 755bf15

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -27
app.py CHANGED
@@ -1,59 +1,81 @@
1
- import gradio as gr
 
2
  from PIL import Image, ImageDraw
 
 
3
  import requests
4
  from io import BytesIO
5
- from transformers import TrOCRProcessor, VisionEncoderDecoderModel
6
 
7
- # Load OCR model
8
  processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
9
  model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
10
 
 
 
 
11
  def load_image(image_file, image_url):
12
- """
13
- Load image from file or URL.
14
- """
15
  if image_file:
16
  return image_file
17
  elif image_url:
18
  response = requests.get(image_url)
19
  return Image.open(BytesIO(response.content)).convert("RGB")
20
- else:
21
- return None
22
 
23
- def detect_text(image_file, image_url):
24
- """
25
- Detect text in an image and return annotated image + text coordinates.
26
- """
27
  image = load_image(image_file, image_url)
28
  if image is None:
29
- return None, "No image provided."
30
 
31
- # Use the OCR processor to get pixel-level data
32
- pixel_values = processor(images=image, return_tensors="pt").pixel_values
33
- generated_ids = model.generate(pixel_values)
34
- text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
35
 
36
- # For demonstration: bounding box around the full image (TroCR doesn't return coordinates)
37
- # For proper coordinates use an OCR model like PaddleOCR or EasyOCR
38
  draw = ImageDraw.Draw(image)
39
- w, h = image.size
40
- draw.rectangle([0, 0, w, h], outline="red", width=3)
41
- coords_str = f"Full image bounding box: [0,0,{w},{h}]\nDetected text: {text}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- return image, coords_str
44
 
45
  iface = gr.Interface(
46
- fn=detect_text,
47
  inputs=[
48
  gr.Image(type="pil", label="Upload Image"),
49
  gr.Textbox(label="Image URL (optional)")
50
  ],
51
  outputs=[
52
  gr.Image(type="pil", label="Annotated Image"),
53
- gr.Textbox(label="Detected Text & Coordinates")
 
54
  ],
55
- title="Text Detection from Image",
56
- description="Upload an image or enter an image URL, and the app will detect text and show bounding boxes."
57
  )
58
 
59
  if __name__ == "__main__":
 
import json
import tempfile
from io import BytesIO

import easyocr
import gradio as gr
import numpy as np
import requests
from PIL import Image, ImageDraw
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 
# TrOCR model for recognition: encoder-decoder that transcribes a cropped
# handwritten-text image into a string (used per word box further below).
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# EasyOCR reader for bounding boxes: TrOCR does not localise text, so
# EasyOCR (English only) supplies the word-level detection boxes.
reader = easyocr.Reader(['en'])
def load_image(image_file, image_url):
    """Return a PIL image from an uploaded file or from a URL.

    Args:
        image_file: image from the Gradio upload widget (or None).
        image_url: optional URL string to download the image from.

    Returns:
        The uploaded image unchanged (Gradio already provides a PIL
        image), the downloaded image converted to RGB, or None when
        neither input is provided.

    Raises:
        requests.HTTPError: if the URL responds with an error status.
    """
    if image_file:
        return image_file
    if image_url:
        # Timeout so a dead URL cannot hang the whole app indefinitely.
        response = requests.get(image_url, timeout=10)
        # Fail loudly on HTTP errors instead of feeding an error page
        # (e.g. 404 HTML) into the image decoder.
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGB")
    return None
 
def detect_text_trocr_json(image_file, image_url):
    """Detect handwritten text with word-level bounding boxes.

    EasyOCR supplies the word detection boxes (TrOCR alone cannot
    localise text); each box is then cropped and re-recognised with
    TrOCR, which is stronger on handwriting.

    Args:
        image_file: image from the Gradio upload widget (or None).
        image_url: optional URL string to download the image from.

    Returns:
        Tuple of (annotated PIL image, pretty-printed JSON string, path
        to a downloadable JSON file), or (None, "No image provided.",
        None) when no input was given.
    """
    image = load_image(image_file, image_url)
    if image is None:
        return None, "No image provided.", None

    # Step 1: detect bounding boxes with EasyOCR.
    results = reader.readtext(np.array(image))

    # Crop words from an untouched copy: rectangles drawn on `image` for
    # earlier words would otherwise bleed into later crops and confuse
    # TrOCR when boxes sit close together.
    clean = image.copy()
    draw = ImageDraw.Draw(image)
    words_json = []

    for bbox, _, conf in results:
        # EasyOCR returns a 4-point polygon; reduce it to an
        # axis-aligned box.  Coerce to plain floats up front — the
        # points can be numpy scalars, which json.dumps rejects.
        x_coords = [float(point[0]) for point in bbox]
        y_coords = [float(point[1]) for point in bbox]
        x_min, y_min = min(x_coords), min(y_coords)
        x_max, y_max = max(x_coords), max(y_coords)

        # Step 2: recognise each cropped word with TrOCR.
        word_crop = clean.crop((x_min, y_min, x_max, y_max))
        pixel_values = processor(images=word_crop, return_tensors="pt").pixel_values
        generated_ids = model.generate(pixel_values)
        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=2)

        words_json.append({
            "text": text,
            "bbox": [x_min, y_min, x_max, y_max],
            "confidence": float(conf),
        })

    # No real paragraph grouping yet; mirror the word list for now.
    paragraph_json = words_json.copy()

    output_json = {
        "words": words_json,
        "paragraphs": paragraph_json,
    }
    pretty = json.dumps(output_json, indent=2)

    # gr.File expects a filesystem path, not raw JSON text — write the
    # payload to a temp file so the download button actually works.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".json", delete=False, encoding="utf-8"
    ) as f:
        f.write(pretty)
        json_path = f.name

    return image, pretty, json_path
# Gradio UI: accepts an uploaded image or an image URL; returns the
# annotated image, the JSON payload as text, and a downloadable JSON file.
_interface_inputs = [
    gr.Image(type="pil", label="Upload Image"),
    gr.Textbox(label="Image URL (optional)"),
]
_interface_outputs = [
    gr.Image(type="pil", label="Annotated Image"),
    gr.Textbox(label="Text & Bounding Boxes (JSON)"),
    gr.File(label="Download JSON"),
]

iface = gr.Interface(
    fn=detect_text_trocr_json,
    inputs=_interface_inputs,
    outputs=_interface_outputs,
    title="Handwritten OCR with TrOCR + Bounding Boxes",
    description="Detect handwritten text and bounding boxes. Uses TrOCR for recognition and EasyOCR for detection.",
)
80
 
81
  if __name__ == "__main__":