rahul7star committed on
Commit
dc023a9
·
verified ·
1 Parent(s): e9b8d71

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -53
app.py CHANGED
@@ -7,82 +7,164 @@ import json
7
  import tempfile
8
  import easyocr
9
  from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 
 
 
10
 
11
# TrOCR model for recognition of handwritten text (loaded once at import time;
# first run downloads the weights from the Hugging Face hub)
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# EasyOCR reader for bounding boxes (English only)
reader = easyocr.Reader(['en'])
17
 
18
def load_image(image_file, image_url):
    """Resolve the two optional inputs into a single PIL image.

    The uploaded file wins over the URL; if neither is provided, None is
    returned so the caller can report "no image".
    """
    if image_file:
        return image_file
    if not image_url:
        return None
    resp = requests.get(image_url)
    return Image.open(BytesIO(resp.content)).convert("RGB")
25
-
26
def detect_text_trocr_json(image_file, image_url):
    """Detect and recognize handwritten text in an image.

    Uses EasyOCR for word-level detection (bounding boxes) and TrOCR for
    recognition of each detected crop. Draws red rectangles on the input
    image and serializes the results to JSON.

    Args:
        image_file: PIL image from the upload widget (or None).
        image_url: optional URL to download an image from.

    Returns:
        tuple: (annotated PIL image or None,
                JSON string of words/paragraphs or an error message,
                path of a temp .json file for Gradio download, or None).
    """
    image = load_image(image_file, image_url)
    if image is None:
        return None, "No image provided.", None

    results = reader.readtext(np.array(image))
    draw = ImageDraw.Draw(image)
    words_json = []

    for bbox, _, conf in results:
        # Convert coordinates to float for JSON serialization
        # (EasyOCR returns four corner points; reduce to an axis-aligned box)
        x_coords = [float(point[0]) for point in bbox]
        y_coords = [float(point[1]) for point in bbox]
        x_min, y_min = min(x_coords), min(y_coords)
        x_max, y_max = max(x_coords), max(y_coords)

        # Crop each word for recognition with TrOCR
        word_crop = image.crop((x_min, y_min, x_max, y_max))
        pixel_values = processor(images=word_crop, return_tensors="pt").pixel_values
        generated_ids = model.generate(pixel_values)
        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=2)

        words_json.append({
            "text": text,
            "bbox": [x_min, y_min, x_max, y_max],
            "confidence": float(conf)
        })

    # Treat words as paragraphs for simplicity (no layout analysis yet)
    paragraphs_json = words_json.copy()

    output_json = {
        "words": words_json,
        "paragraphs": paragraphs_json
    }

    json_str = json.dumps(output_json, indent=2)

    # Save JSON to a temporary file for Gradio download
    # (delete=False: the file must outlive this call so Gradio can serve it)
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode='w')
    tmp_file.write(json_str)
    tmp_file.close()

    return image, json_str, tmp_file.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
# Gradio UI: image upload or URL in; annotated image, JSON text, and a
# downloadable JSON file out.
iface = gr.Interface(
    fn=detect_text_trocr_json,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Image URL (optional)")
    ],
    outputs=[
        gr.Image(type="pil", label="Annotated Image"),
        gr.Textbox(label="Text & Bounding Boxes (JSON)"),
        gr.File(label="Download JSON")
    ],
    title="Handwritten OCR with TrOCR + Bounding Boxes",
    description="Detect handwritten text and bounding boxes. Uses TrOCR for recognition and EasyOCR for detection."
)
87
 
88
  if __name__ == "__main__":
 
7
  import tempfile
8
  import easyocr
9
  from transformers import TrOCRProcessor, VisionEncoderDecoderModel
10
+ from bs4 import BeautifulSoup
11
+ import base64
12
+ import re
13
 
14
# Initialize OCR models: TrOCR (handwritten text recognition) and EasyOCR
# (word detection / bounding boxes, English only). Loaded once at import time.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
reader = easyocr.Reader(['en'])
18
 
19
def extract_images_from_html(html_file):
    """Extract images referenced by an HTML file (base64 data URIs or URLs).

    Args:
        html_file: readable file-like object containing the HTML document.

    Returns:
        list: PIL RGB images in document order. Images that cannot be
        fetched or decoded are skipped (best effort).
    """
    images = []
    soup = BeautifulSoup(html_file.read(), "html.parser")
    for img_tag in soup.find_all("img"):
        src = img_tag.get("src")
        if not src:
            continue
        if src.startswith("data:image"):
            # Inline data URI: strip the "data:image/<fmt>;base64," prefix, decode.
            b64_data = re.sub(r"^data:image/.+;base64,", "", src)
            image = Image.open(BytesIO(base64.b64decode(b64_data))).convert("RGB")
            images.append(image)
        else:
            # Remote URL: best-effort download, skip anything unreachable.
            try:
                # timeout so one dead link can't hang the whole request
                response = requests.get(src, timeout=10)
                image = Image.open(BytesIO(response.content)).convert("RGB")
                images.append(image)
            except Exception:
                # was a bare `except:` — that also swallowed SystemExit and
                # KeyboardInterrupt; Exception keeps the best-effort intent
                continue
    return images
39
+
40
def parse_html_text(html_file):
    """Parse visible HTML text into word/line JSON with approximate boxes.

    HTML carries no pixel geometry, so boxes are synthesized on a fixed grid:
    every line is `line_height` px tall and every character `char_width` px
    wide, stacked top-to-bottom in document order.

    Args:
        html_file: readable file-like object containing HTML (bytes or str).

    Returns:
        tuple: (raw HTML string,
                {"words": [...], "lines": [...]}) where each entry carries
                "text" and "bbox" as [x_min, y_min, x_max, y_max].
    """
    html_content = html_file.read()
    # Fix: read() may yield str (text-mode file object) instead of bytes;
    # unconditional .decode("utf-8") raised AttributeError in that case.
    if isinstance(html_content, bytes):
        html_content = html_content.decode("utf-8")
    soup = BeautifulSoup(html_content, "html.parser")
    body_text = soup.get_text(separator="\n")
    lines = [line.strip() for line in body_text.split("\n") if line.strip()]

    words_json = []
    lines_json = []

    y_offset = 0
    line_height = 20  # synthetic line height (px)
    char_width = 10   # synthetic monospace character width (px)

    for line in lines:
        line_words = line.split()
        line_bbox = [0, y_offset, char_width * len(line), y_offset + line_height]

        word_entries = []
        x_offset = 0
        for word in line_words:
            word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
            word_entries.append({
                "text": word,
                "bbox": word_bbox
            })
            words_json.append({
                "text": word,
                "bbox": word_bbox
            })
            x_offset += char_width * (len(word) + 1)  # +1 for the inter-word space

        lines_json.append({
            "text": line,
            "bbox": line_bbox,
            "words": word_entries
        })

        y_offset += line_height

    output_json = {
        "words": words_json,
        "lines": lines_json
    }

    return html_content, output_json
 
 
 
 
 
86
 
87
def load_image(image_file, image_url):
    """Normalize the two optional image inputs into a list of PIL images.

    The uploaded file wins over the URL.

    Args:
        image_file: already-loaded PIL image from the upload widget (or None).
        image_url: optional URL to download an image from.

    Returns:
        list: [image] for whichever input was provided, else [] (no input).
    """
    if image_file:
        return [image_file]
    if image_url:
        # Fix: add a timeout so a dead/slow URL cannot hang the app forever.
        response = requests.get(image_url, timeout=10)
        return [Image.open(BytesIO(response.content)).convert("RGB")]
    return []
94
+
95
def _json_to_tmp_file(json_str):
    """Write a JSON string to a temp .json file and return its path.

    delete=False: the file must outlive this call so Gradio can serve it.
    """
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
    tmp_file.write(json_str)
    tmp_file.close()
    return tmp_file.name

def detect_text_combined(image_file, image_url, html_file):
    """Run OCR on an image, or extract text boxes from an HTML file.

    The HTML path takes precedence: if an HTML file is supplied, its visible
    text is parsed into synthetic word/line boxes. Otherwise the image path
    detects words with EasyOCR, recognizes each crop with TrOCR, and draws
    red rectangles on the image.

    Args:
        image_file: PIL image from the upload widget (or None).
        image_url: optional URL to download an image from.
        html_file: optional readable file-like object with HTML.

    Returns:
        tuple: (annotated image / raw HTML string / None,
                JSON string of results or an error message,
                path to a downloadable JSON temp file, or None).
    """
    # HTML path
    if html_file:
        html_content, output_json = parse_html_text(html_file)
        json_str = json.dumps(output_json, indent=2)
        return html_content, json_str, _json_to_tmp_file(json_str)

    # Image path
    images = load_image(image_file, image_url)
    if not images:
        return None, "No input provided.", None

    # (removed unused local `all_output_json` from the previous revision)
    annotated_images = []

    for image in images:
        results = reader.readtext(np.array(image))
        draw = ImageDraw.Draw(image)
        words_json = []

        for bbox, _, conf in results:
            # EasyOCR returns four corner points; reduce to an axis-aligned
            # box with float coords for JSON serialization.
            x_coords = [float(point[0]) for point in bbox]
            y_coords = [float(point[1]) for point in bbox]
            x_min, y_min = min(x_coords), min(y_coords)
            x_max, y_max = max(x_coords), max(y_coords)

            # Crop word for TrOCR recognition
            word_crop = image.crop((x_min, y_min, x_max, y_max))
            pixel_values = processor(images=word_crop, return_tensors="pt").pixel_values
            generated_ids = model.generate(pixel_values)
            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

            draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=2)

            words_json.append({
                "text": text,
                "bbox": [x_min, y_min, x_max, y_max],
                "confidence": float(conf)
            })

        # Treat each word as its own paragraph (no layout analysis yet).
        paragraphs_json = words_json.copy()
        output_json = {
            "words": words_json,
            "paragraphs": paragraphs_json
        }
        json_str = json.dumps(output_json, indent=2)

        annotated_images.append((image, json_str, _json_to_tmp_file(json_str)))

    # load_image currently yields at most one image; return the first result.
    # NOTE(review): if multiple images are ever supported, switch to a gallery.
    return annotated_images[0]
153
 
154
# Gradio UI: three optional inputs (image upload, image URL, HTML file);
# detect_text_combined uses the HTML path when a file is given, otherwise
# falls back to the image path.
iface = gr.Interface(
    fn=detect_text_combined,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Image URL (optional)"),
        gr.File(label="Upload HTML File", file_types=[".html", ".htm"])
    ],
    outputs=[
        gr.Image(type="pil", label="Annotated Image / N/A for HTML"),
        gr.Textbox(label="JSON Output"),
        gr.File(label="Download JSON")
    ],
    title="Combined OCR & HTML Text Bounding Box Extractor",
    description="Upload an image, provide an image URL, or upload an HTML file. Outputs word- and line-level bounding boxes in JSON with annotated images for images."
)
169
 
170
  if __name__ == "__main__":