Update app.py

app.py CHANGED
@@ -23,40 +23,42 @@ from bs4 import BeautifulSoup
 
 def parse_html_to_json(html_file):
     """
-
-
+    Properly parse HTML file uploaded via Gradio.
+    Returns JSON with words and paragraphs like image OCR output.
     """
-    # Handle Gradio NamedString, str, or file-like object
     html_content = ""
-    if hasattr(html_file, "read"):  # real file
-        html_content = html_file.read()
-        if isinstance(html_content, bytes):
-            html_content = html_content.decode("utf-8")
-    elif isinstance(html_file, str):
-        html_content = html_file
-    else:  # Gradio NamedString
-        html_content = getattr(html_file, "name", str(html_file))
 
-
+    try:
+        # Gradio gives a temp file path string for uploaded files
+        if isinstance(html_file, str):
+            with open(html_file, "r", encoding="utf-8") as f:
+                html_content = f.read()
+        elif hasattr(html_file, "read"):  # file-like object
+            html_content = html_file.read()
+            if isinstance(html_content, bytes):
+                html_content = html_content.decode("utf-8")
+        else:
+            html_content = str(html_file)
+    except Exception as e:
+        return {"error": f"Cannot read HTML file: {e}"}
 
+    soup = BeautifulSoup(html_content, "html.parser")
     words_json = []
     paragraphs_json = []
     y_offset = 0
     line_height = 20
     char_width = 10
 
-    # iterate over all visible text nodes in the body
     body = soup.body
     if not body:
         body = soup
 
-    #
+    # iterate over all visible text nodes
     for element in body.find_all(text=True):
         text = element.strip()
         if not text:
             continue
 
-        # split into words
         line_words = text.split()
         line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
 
@@ -74,14 +76,12 @@ def parse_html_to_json(html_file):
             "bbox": line_bbox,
             "words": word_entries
         })
-
        y_offset += line_height
 
     output_json = {
         "words": words_json,
         "paragraphs": paragraphs_json
    }
-
     return output_json
 
 # ----------------- Image Loading -----------------
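
For context, a minimal sketch of how the updated function behaves when called directly. The import path, sample markup, and temp-file setup here are illustrative assumptions, not part of this Space:

# Minimal usage sketch (assumptions: parse_html_to_json is importable from
# this Space's app.py, and beautifulsoup4 is installed for it to use).
import json
import tempfile

from app import parse_html_to_json

sample_html = "<html><body><p>Hello world</p><p>Second line</p></body></html>"

# Gradio passes uploads to the handler as a temp-file path string, which is
# the branch the new try-block reads via open(..., encoding="utf-8").
with tempfile.NamedTemporaryFile(
    "w", suffix=".html", encoding="utf-8", delete=False
) as f:
    f.write(sample_html)
    path = f.name

result = parse_html_to_json(path)
print(json.dumps(result, indent=2))

# The bounding boxes are synthetic rather than rendered: each text node is
# one line, y_offset advances by line_height (20) per line, and the width is
# char_width (10) * len(text). "Hello world" (11 chars) gets bbox
# [0, 0, 110, 20]; "Second line" gets [0, 20, 110, 40].

Note that because unreadable inputs now return {"error": ...} instead of raising, callers should check for an "error" key before using the "words" and "paragraphs" fields.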