rahul7star committed on
Commit
051d865
·
verified ·
1 Parent(s): eeb0aa9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -20
app.py CHANGED
@@ -16,6 +16,7 @@ processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
16
  model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
17
  reader = easyocr.Reader(['en'])
18
 
 
19
  def extract_images_from_html(html_content):
20
  """Extract images from HTML content (base64 or URLs)"""
21
  images = []
@@ -37,34 +38,32 @@ def extract_images_from_html(html_content):
37
  continue
38
  return images
39
 
40
- def parse_html_text(html_file):
41
- """Parse HTML text and generate approximate bounding boxes"""
42
- # Handle different Gradio file types
43
  if hasattr(html_file, "read"):
44
  html_content = html_file.read()
45
  if isinstance(html_content, bytes):
46
  html_content = html_content.decode("utf-8")
47
  else:
48
- # NamedString object (Gradio v3.40+)
49
  html_content = str(html_file)
50
 
51
- # Extract images from HTML (optional, for OCR later)
52
  images_in_html = extract_images_from_html(html_content)
53
 
54
  soup = BeautifulSoup(html_content, "html.parser")
55
- body_text = soup.get_text(separator="\n")
56
- lines = [line.strip() for line in body_text.split("\n") if line.strip()]
57
-
58
  words_json = []
59
  lines_json = []
60
-
61
  y_offset = 0
62
  line_height = 20
63
  char_width = 10
64
 
65
- for line in lines:
66
- line_words = line.split()
67
- line_bbox = [0, y_offset, char_width * len(line), y_offset + line_height]
 
 
 
 
 
68
 
69
  word_entries = []
70
  x_offset = 0
@@ -81,7 +80,7 @@ def parse_html_text(html_file):
81
  x_offset += char_width * (len(word) + 1)
82
 
83
  lines_json.append({
84
- "text": line,
85
  "bbox": line_bbox,
86
  "words": word_entries
87
  })
@@ -94,8 +93,9 @@ def parse_html_text(html_file):
94
  "images_found": len(images_in_html)
95
  }
96
 
97
- return html_content, output_json, images_in_html
98
 
 
99
  def load_image(image_file, image_url):
100
  if image_file:
101
  return [image_file]
@@ -104,18 +104,16 @@ def load_image(image_file, image_url):
104
  return [Image.open(BytesIO(response.content)).convert("RGB")]
105
  return []
106
 
 
107
  def detect_text_combined(image_file, image_url, html_file):
108
  # HTML path
109
  if html_file:
110
- html_content, output_json, images_in_html = parse_html_text(html_file)
111
  json_str = json.dumps(output_json, indent=2)
112
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
113
  tmp_file.write(json_str)
114
  tmp_file.close()
115
- annotated_image = None
116
- if images_in_html:
117
- # For demo, show first extracted image if exists
118
- annotated_image = images_in_html[0]
119
  return annotated_image, json_str, tmp_file.name
120
 
121
  # Image path
@@ -161,6 +159,7 @@ def detect_text_combined(image_file, image_url, html_file):
161
 
162
  return annotated_image, json_str, tmp_file.name
163
 
 
164
  iface = gr.Interface(
165
  fn=detect_text_combined,
166
  inputs=[
@@ -169,7 +168,7 @@ iface = gr.Interface(
169
  gr.File(label="Upload HTML File", file_types=[".html", ".htm"])
170
  ],
171
  outputs=[
172
- gr.Image(type="pil", label="Annotated Image / N/A for HTML"),
173
  gr.Textbox(label="JSON Output"),
174
  gr.File(label="Download JSON")
175
  ],
 
16
  model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
17
  reader = easyocr.Reader(['en'])
18
 
19
+ # ----------------- HTML Utilities -----------------
20
  def extract_images_from_html(html_content):
21
  """Extract images from HTML content (base64 or URLs)"""
22
  images = []
 
38
  continue
39
  return images
40
 
41
+ def parse_html_words(html_file):
42
+ """Extract words and lines from HTML with approximate bounding boxes"""
 
43
  if hasattr(html_file, "read"):
44
  html_content = html_file.read()
45
  if isinstance(html_content, bytes):
46
  html_content = html_content.decode("utf-8")
47
  else:
 
48
  html_content = str(html_file)
49
 
 
50
  images_in_html = extract_images_from_html(html_content)
51
 
52
  soup = BeautifulSoup(html_content, "html.parser")
 
 
 
53
  words_json = []
54
  lines_json = []
 
55
  y_offset = 0
56
  line_height = 20
57
  char_width = 10
58
 
59
+ # Traverse visible text blocks
60
+ for block in soup.find_all(['p', 'div', 'span', 'h1', 'h2', 'h3', 'li']):
61
+ text = block.get_text(separator=' ', strip=True)
62
+ if not text:
63
+ continue
64
+
65
+ line_words = text.split()
66
+ line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
67
 
68
  word_entries = []
69
  x_offset = 0
 
80
  x_offset += char_width * (len(word) + 1)
81
 
82
  lines_json.append({
83
+ "text": text,
84
  "bbox": line_bbox,
85
  "words": word_entries
86
  })
 
93
  "images_found": len(images_in_html)
94
  }
95
 
96
+ return output_json, images_in_html
97
 
98
+ # ----------------- Image Utilities -----------------
99
  def load_image(image_file, image_url):
100
  if image_file:
101
  return [image_file]
 
104
  return [Image.open(BytesIO(response.content)).convert("RGB")]
105
  return []
106
 
107
+ # ----------------- Main Combined Logic -----------------
108
  def detect_text_combined(image_file, image_url, html_file):
109
  # HTML path
110
  if html_file:
111
+ output_json, images_in_html = parse_html_words(html_file)
112
  json_str = json.dumps(output_json, indent=2)
113
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
114
  tmp_file.write(json_str)
115
  tmp_file.close()
116
+ annotated_image = images_in_html[0] if images_in_html else None
 
 
 
117
  return annotated_image, json_str, tmp_file.name
118
 
119
  # Image path
 
159
 
160
  return annotated_image, json_str, tmp_file.name
161
 
162
+ # ----------------- Gradio Interface -----------------
163
  iface = gr.Interface(
164
  fn=detect_text_combined,
165
  inputs=[
 
168
  gr.File(label="Upload HTML File", file_types=[".html", ".htm"])
169
  ],
170
  outputs=[
171
+ gr.Image(type="pil", label="Annotated Image / First Extracted Image for HTML"),
172
  gr.Textbox(label="JSON Output"),
173
  gr.File(label="Download JSON")
174
  ],