rahul7star committed on
Commit
d954be0
·
verified ·
1 Parent(s): ab4c9ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -13
app.py CHANGED
@@ -17,8 +17,11 @@ model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwrit
17
  reader = easyocr.Reader(['en'])
18
 
19
  # ----------------- HTML Parsing -----------------
 
 
20
  def parse_html_to_json(html_file):
21
- """Extract words and paragraphs from HTML in the same structure as image OCR"""
 
22
  if hasattr(html_file, "read"):
23
  html_content = html_file.read()
24
  if isinstance(html_content, bytes):
@@ -34,11 +37,17 @@ def parse_html_to_json(html_file):
34
  line_height = 20
35
  char_width = 10
36
 
37
- for tag in soup.find_all(True): # All tags
38
- text = tag.get_text(separator=' ', strip=True)
 
 
 
 
 
39
  if not text:
40
  continue
41
 
 
42
  line_words = text.split()
43
  line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
44
 
@@ -46,16 +55,9 @@ def parse_html_to_json(html_file):
46
  x_offset = 0
47
  for word in line_words:
48
  word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
49
- word_entries.append({
50
- "text": word,
51
- "bbox": word_bbox,
52
- "confidence": 1.0
53
- })
54
- words_json.append({
55
- "text": word,
56
- "bbox": word_bbox,
57
- "confidence": 1.0
58
- })
59
  x_offset += char_width * (len(word) + 1)
60
 
61
  paragraphs_json.append({
 
17
  reader = easyocr.Reader(['en'])
18
 
19
  # ----------------- HTML Parsing -----------------
20
+ from bs4 import BeautifulSoup
21
+
22
  def parse_html_to_json(html_file):
23
+ """Parse HTML and extract words/paragraphs JSON compatible with image OCR output."""
24
+ # Read content depending on object type
25
  if hasattr(html_file, "read"):
26
  html_content = html_file.read()
27
  if isinstance(html_content, bytes):
 
37
  line_height = 20
38
  char_width = 10
39
 
40
+ # Iterate over all text elements inside the body
41
+ body = soup.body
42
+ if not body:
43
+ body = soup # fallback if <body> missing
44
+
45
+ for element in body.find_all(text=True):
46
+ text = element.strip()
47
  if not text:
48
  continue
49
 
50
+ # Split into words
51
  line_words = text.split()
52
  line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
53
 
 
55
  x_offset = 0
56
  for word in line_words:
57
  word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
58
+ word_entry = {"text": word, "bbox": word_bbox, "confidence": 1.0}
59
+ word_entries.append(word_entry)
60
+ words_json.append(word_entry)
 
 
 
 
 
 
 
61
  x_offset += char_width * (len(word) + 1)
62
 
63
  paragraphs_json.append({