Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -19,15 +19,23 @@ reader = easyocr.Reader(['en'])
|
|
19 |
# ----------------- HTML Parsing -----------------
|
20 |
from bs4 import BeautifulSoup
|
21 |
|
|
|
|
|
22 |
def parse_html_to_json(html_file):
|
23 |
-
"""
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
26 |
html_content = html_file.read()
|
27 |
if isinstance(html_content, bytes):
|
28 |
html_content = html_content.decode("utf-8")
|
29 |
-
|
30 |
-
html_content =
|
|
|
|
|
31 |
|
32 |
soup = BeautifulSoup(html_content, "html.parser")
|
33 |
|
@@ -37,17 +45,18 @@ def parse_html_to_json(html_file):
|
|
37 |
line_height = 20
|
38 |
char_width = 10
|
39 |
|
40 |
-
#
|
41 |
body = soup.body
|
42 |
if not body:
|
43 |
-
body = soup # fallback
|
44 |
|
|
|
45 |
for element in body.find_all(text=True):
|
46 |
text = element.strip()
|
47 |
if not text:
|
48 |
continue
|
49 |
|
50 |
-
#
|
51 |
line_words = text.split()
|
52 |
line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
|
53 |
|
|
|
19 |
# ----------------- HTML Parsing -----------------
|
20 |
from bs4 import BeautifulSoup
|
21 |
|
22 |
+
from bs4 import BeautifulSoup
|
23 |
+
|
24 |
def parse_html_to_json(html_file):
|
25 |
+
"""
|
26 |
+
Parse HTML content from a Gradio file input or string and produce
|
27 |
+
words/paragraphs JSON compatible with image OCR output.
|
28 |
+
"""
|
29 |
+
# Handle Gradio NamedString, str, or file-like object
|
30 |
+
html_content = ""
|
31 |
+
if hasattr(html_file, "read"): # real file
|
32 |
html_content = html_file.read()
|
33 |
if isinstance(html_content, bytes):
|
34 |
html_content = html_content.decode("utf-8")
|
35 |
+
elif isinstance(html_file, str):
|
36 |
+
html_content = html_file
|
37 |
+
else: # Gradio NamedString
|
38 |
+
html_content = getattr(html_file, "name", str(html_file))
|
39 |
|
40 |
soup = BeautifulSoup(html_content, "html.parser")
|
41 |
|
|
|
45 |
line_height = 20
|
46 |
char_width = 10
|
47 |
|
48 |
+
# iterate over all visible text nodes in the body
|
49 |
body = soup.body
|
50 |
if not body:
|
51 |
+
body = soup # fallback
|
52 |
|
53 |
+
# Only consider visible text
|
54 |
for element in body.find_all(text=True):
|
55 |
text = element.strip()
|
56 |
if not text:
|
57 |
continue
|
58 |
|
59 |
+
# split into words
|
60 |
line_words = text.split()
|
61 |
line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
|
62 |
|