rahul7star committed on
Commit
240048f
·
verified ·
1 Parent(s): d954be0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -8
app.py CHANGED
@@ -19,15 +19,23 @@ reader = easyocr.Reader(['en'])
19
  # ----------------- HTML Parsing -----------------
20
  from bs4 import BeautifulSoup
21
 
 
 
22
  def parse_html_to_json(html_file):
23
- """Parse HTML and extract words/paragraphs JSON compatible with image OCR output."""
24
- # Read content depending on object type
25
- if hasattr(html_file, "read"):
 
 
 
 
26
  html_content = html_file.read()
27
  if isinstance(html_content, bytes):
28
  html_content = html_content.decode("utf-8")
29
- else:
30
- html_content = str(html_file)
 
 
31
 
32
  soup = BeautifulSoup(html_content, "html.parser")
33
 
@@ -37,17 +45,18 @@ def parse_html_to_json(html_file):
37
  line_height = 20
38
  char_width = 10
39
 
40
- # Iterate over all text elements inside the body
41
  body = soup.body
42
  if not body:
43
- body = soup # fallback if <body> missing
44
 
 
45
  for element in body.find_all(text=True):
46
  text = element.strip()
47
  if not text:
48
  continue
49
 
50
- # Split into words
51
  line_words = text.split()
52
  line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
53
 
 
19
  # ----------------- HTML Parsing -----------------
20
  from bs4 import BeautifulSoup
21
 
22
+ from bs4 import BeautifulSoup
23
+
24
  def parse_html_to_json(html_file):
25
+ """
26
+ Parse HTML content from a Gradio file input or string and produce
27
+ words/paragraphs JSON compatible with image OCR output.
28
+ """
29
+ # Handle Gradio NamedString, str, or file-like object
30
+ html_content = ""
31
+ if hasattr(html_file, "read"): # real file
32
  html_content = html_file.read()
33
  if isinstance(html_content, bytes):
34
  html_content = html_content.decode("utf-8")
35
+ elif isinstance(html_file, str):
36
+ html_content = html_file
37
+ else: # Gradio NamedString
38
+ html_content = getattr(html_file, "name", str(html_file))
39
 
40
  soup = BeautifulSoup(html_content, "html.parser")
41
 
 
45
  line_height = 20
46
  char_width = 10
47
 
48
+ # iterate over all visible text nodes in the body
49
  body = soup.body
50
  if not body:
51
+ body = soup # fallback
52
 
53
+ # Only consider visible text
54
  for element in body.find_all(text=True):
55
  text = element.strip()
56
  if not text:
57
  continue
58
 
59
+ # split into words
60
  line_words = text.split()
61
  line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
62