Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -17,8 +17,11 @@ model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwrit
|
|
17 |
reader = easyocr.Reader(['en'])
|
18 |
|
19 |
# ----------------- HTML Parsing -----------------
|
|
|
|
|
20 |
def parse_html_to_json(html_file):
|
21 |
-
"""
|
|
|
22 |
if hasattr(html_file, "read"):
|
23 |
html_content = html_file.read()
|
24 |
if isinstance(html_content, bytes):
|
@@ -34,11 +37,17 @@ def parse_html_to_json(html_file):
|
|
34 |
line_height = 20
|
35 |
char_width = 10
|
36 |
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
39 |
if not text:
|
40 |
continue
|
41 |
|
|
|
42 |
line_words = text.split()
|
43 |
line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
|
44 |
|
@@ -46,16 +55,9 @@ def parse_html_to_json(html_file):
|
|
46 |
x_offset = 0
|
47 |
for word in line_words:
|
48 |
word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
"confidence": 1.0
|
53 |
-
})
|
54 |
-
words_json.append({
|
55 |
-
"text": word,
|
56 |
-
"bbox": word_bbox,
|
57 |
-
"confidence": 1.0
|
58 |
-
})
|
59 |
x_offset += char_width * (len(word) + 1)
|
60 |
|
61 |
paragraphs_json.append({
|
|
|
17 |
reader = easyocr.Reader(['en'])
|
18 |
|
19 |
# ----------------- HTML Parsing -----------------
|
20 |
+
from bs4 import BeautifulSoup
|
21 |
+
|
22 |
def parse_html_to_json(html_file):
|
23 |
+
"""Parse HTML and extract words/paragraphs JSON compatible with image OCR output."""
|
24 |
+
# Read content depending on object type
|
25 |
if hasattr(html_file, "read"):
|
26 |
html_content = html_file.read()
|
27 |
if isinstance(html_content, bytes):
|
|
|
37 |
line_height = 20
|
38 |
char_width = 10
|
39 |
|
40 |
+
# Iterate over all text elements inside the body
|
41 |
+
body = soup.body
|
42 |
+
if not body:
|
43 |
+
body = soup # fallback if <body> missing
|
44 |
+
|
45 |
+
for element in body.find_all(text=True):
|
46 |
+
text = element.strip()
|
47 |
if not text:
|
48 |
continue
|
49 |
|
50 |
+
# Split into words
|
51 |
line_words = text.split()
|
52 |
line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
|
53 |
|
|
|
55 |
x_offset = 0
|
56 |
for word in line_words:
|
57 |
word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
|
58 |
+
word_entry = {"text": word, "bbox": word_bbox, "confidence": 1.0}
|
59 |
+
word_entries.append(word_entry)
|
60 |
+
words_json.append(word_entry)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
x_offset += char_width * (len(word) + 1)
|
62 |
|
63 |
paragraphs_json.append({
|