rahul7star committed on
Commit
ab4c9ec
·
verified ·
1 Parent(s): 051d865

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -45
app.py CHANGED
@@ -11,35 +11,14 @@ from bs4 import BeautifulSoup
11
  import base64
12
  import re
13
 
14
- # Initialize OCR models
15
  processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
16
  model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
17
  reader = easyocr.Reader(['en'])
18
 
19
- # ----------------- HTML Utilities -----------------
20
- def extract_images_from_html(html_content):
21
- """Extract images from HTML content (base64 or URLs)"""
22
- images = []
23
- soup = BeautifulSoup(html_content, "html.parser")
24
- for img_tag in soup.find_all("img"):
25
- src = img_tag.get("src")
26
- if not src:
27
- continue
28
- if src.startswith("data:image"):
29
- b64_data = re.sub(r"^data:image/.+;base64,", "", src)
30
- image = Image.open(BytesIO(base64.b64decode(b64_data))).convert("RGB")
31
- images.append(image)
32
- else:
33
- try:
34
- response = requests.get(src)
35
- image = Image.open(BytesIO(response.content)).convert("RGB")
36
- images.append(image)
37
- except:
38
- continue
39
- return images
40
-
41
- def parse_html_words(html_file):
42
- """Extract words and lines from HTML with approximate bounding boxes"""
43
  if hasattr(html_file, "read"):
44
  html_content = html_file.read()
45
  if isinstance(html_content, bytes):
@@ -47,18 +26,16 @@ def parse_html_words(html_file):
47
  else:
48
  html_content = str(html_file)
49
 
50
- images_in_html = extract_images_from_html(html_content)
51
-
52
  soup = BeautifulSoup(html_content, "html.parser")
 
53
  words_json = []
54
- lines_json = []
55
  y_offset = 0
56
  line_height = 20
57
  char_width = 10
58
 
59
- # Traverse visible text blocks
60
- for block in soup.find_all(['p', 'div', 'span', 'h1', 'h2', 'h3', 'li']):
61
- text = block.get_text(separator=' ', strip=True)
62
  if not text:
63
  continue
64
 
@@ -71,15 +48,17 @@ def parse_html_words(html_file):
71
  word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
72
  word_entries.append({
73
  "text": word,
74
- "bbox": word_bbox
 
75
  })
76
  words_json.append({
77
  "text": word,
78
- "bbox": word_bbox
 
79
  })
80
  x_offset += char_width * (len(word) + 1)
81
 
82
- lines_json.append({
83
  "text": text,
84
  "bbox": line_bbox,
85
  "words": word_entries
@@ -89,13 +68,12 @@ def parse_html_words(html_file):
89
 
90
  output_json = {
91
  "words": words_json,
92
- "lines": lines_json,
93
- "images_found": len(images_in_html)
94
  }
95
 
96
- return output_json, images_in_html
97
 
98
- # ----------------- Image Utilities -----------------
99
  def load_image(image_file, image_url):
100
  if image_file:
101
  return [image_file]
@@ -104,19 +82,19 @@ def load_image(image_file, image_url):
104
  return [Image.open(BytesIO(response.content)).convert("RGB")]
105
  return []
106
 
107
- # ----------------- Main Combined Logic -----------------
108
  def detect_text_combined(image_file, image_url, html_file):
109
- # HTML path
110
  if html_file:
111
- output_json, images_in_html = parse_html_words(html_file)
112
  json_str = json.dumps(output_json, indent=2)
113
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
114
  tmp_file.write(json_str)
115
  tmp_file.close()
116
- annotated_image = images_in_html[0] if images_in_html else None
117
  return annotated_image, json_str, tmp_file.name
118
 
119
- # Image path
120
  images = load_image(image_file, image_url)
121
  if not images:
122
  return None, "No input provided.", None
@@ -168,12 +146,12 @@ iface = gr.Interface(
168
  gr.File(label="Upload HTML File", file_types=[".html", ".htm"])
169
  ],
170
  outputs=[
171
- gr.Image(type="pil", label="Annotated Image / First Extracted Image for HTML"),
172
  gr.Textbox(label="JSON Output"),
173
  gr.File(label="Download JSON")
174
  ],
175
  title="Combined OCR & HTML Text Bounding Box Extractor",
176
- description="Upload an image, provide an image URL, or upload an HTML file. Outputs word- and line-level bounding boxes in JSON with annotated images for images."
177
  )
178
 
179
  if __name__ == "__main__":
 
11
  import base64
12
  import re
13
 
14
+ # ----------------- Initialize OCR -----------------
15
  processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
16
  model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
17
  reader = easyocr.Reader(['en'])
18
 
19
+ # ----------------- HTML Parsing -----------------
20
+ def parse_html_to_json(html_file):
21
+ """Extract words and paragraphs from HTML in the same structure as image OCR"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  if hasattr(html_file, "read"):
23
  html_content = html_file.read()
24
  if isinstance(html_content, bytes):
 
26
  else:
27
  html_content = str(html_file)
28
 
 
 
29
  soup = BeautifulSoup(html_content, "html.parser")
30
+
31
  words_json = []
32
+ paragraphs_json = []
33
  y_offset = 0
34
  line_height = 20
35
  char_width = 10
36
 
37
+ for tag in soup.find_all(True): # All tags
38
+ text = tag.get_text(separator=' ', strip=True)
 
39
  if not text:
40
  continue
41
 
 
48
  word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
49
  word_entries.append({
50
  "text": word,
51
+ "bbox": word_bbox,
52
+ "confidence": 1.0
53
  })
54
  words_json.append({
55
  "text": word,
56
+ "bbox": word_bbox,
57
+ "confidence": 1.0
58
  })
59
  x_offset += char_width * (len(word) + 1)
60
 
61
+ paragraphs_json.append({
62
  "text": text,
63
  "bbox": line_bbox,
64
  "words": word_entries
 
68
 
69
  output_json = {
70
  "words": words_json,
71
+ "paragraphs": paragraphs_json
 
72
  }
73
 
74
+ return output_json
75
 
76
+ # ----------------- Image Loading -----------------
77
  def load_image(image_file, image_url):
78
  if image_file:
79
  return [image_file]
 
82
  return [Image.open(BytesIO(response.content)).convert("RGB")]
83
  return []
84
 
85
+ # ----------------- Main Logic -----------------
86
  def detect_text_combined(image_file, image_url, html_file):
87
+ # ----------------- HTML Path -----------------
88
  if html_file:
89
+ output_json = parse_html_to_json(html_file)
90
  json_str = json.dumps(output_json, indent=2)
91
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
92
  tmp_file.write(json_str)
93
  tmp_file.close()
94
+ annotated_image = None
95
  return annotated_image, json_str, tmp_file.name
96
 
97
+ # ----------------- Image Path -----------------
98
  images = load_image(image_file, image_url)
99
  if not images:
100
  return None, "No input provided.", None
 
146
  gr.File(label="Upload HTML File", file_types=[".html", ".htm"])
147
  ],
148
  outputs=[
149
+ gr.Image(type="pil", label="Annotated Image"),
150
  gr.Textbox(label="JSON Output"),
151
  gr.File(label="Download JSON")
152
  ],
153
  title="Combined OCR & HTML Text Bounding Box Extractor",
154
+ description="Upload an image, provide an image URL, or upload an HTML file. Outputs word- and paragraph-level bounding boxes in JSON format consistent with image OCR output."
155
  )
156
 
157
  if __name__ == "__main__":