rahul7star committed on
Commit
35f0997
·
verified ·
1 Parent(s): 240048f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -18
app.py CHANGED
@@ -23,40 +23,42 @@ from bs4 import BeautifulSoup
23
 
24
  def parse_html_to_json(html_file):
25
  """
26
- Parse HTML content from a Gradio file input or string and produce
27
- words/paragraphs JSON compatible with image OCR output.
28
  """
29
- # Handle Gradio NamedString, str, or file-like object
30
  html_content = ""
31
- if hasattr(html_file, "read"): # real file
32
- html_content = html_file.read()
33
- if isinstance(html_content, bytes):
34
- html_content = html_content.decode("utf-8")
35
- elif isinstance(html_file, str):
36
- html_content = html_file
37
- else: # Gradio NamedString
38
- html_content = getattr(html_file, "name", str(html_file))
39
 
40
- soup = BeautifulSoup(html_content, "html.parser")
 
 
 
 
 
 
 
 
 
 
 
 
41
 
 
42
  words_json = []
43
  paragraphs_json = []
44
  y_offset = 0
45
  line_height = 20
46
  char_width = 10
47
 
48
- # iterate over all visible text nodes in the body
49
  body = soup.body
50
  if not body:
51
- body = soup # fallback
52
 
53
- # Only consider visible text
54
  for element in body.find_all(text=True):
55
  text = element.strip()
56
  if not text:
57
  continue
58
 
59
- # split into words
60
  line_words = text.split()
61
  line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
62
 
@@ -74,14 +76,12 @@ def parse_html_to_json(html_file):
74
  "bbox": line_bbox,
75
  "words": word_entries
76
  })
77
-
78
  y_offset += line_height
79
 
80
  output_json = {
81
  "words": words_json,
82
  "paragraphs": paragraphs_json
83
  }
84
-
85
  return output_json
86
 
87
  # ----------------- Image Loading -----------------
 
23
 
24
  def parse_html_to_json(html_file):
25
  """
26
+ Properly parse HTML file uploaded via Gradio.
27
+ Returns JSON with words and paragraphs like image OCR output.
28
  """
 
29
  html_content = ""
 
 
 
 
 
 
 
 
30
 
31
+ try:
32
+ # Gradio gives a temp file path string for uploaded files
33
+ if isinstance(html_file, str):
34
+ with open(html_file, "r", encoding="utf-8") as f:
35
+ html_content = f.read()
36
+ elif hasattr(html_file, "read"): # file-like object
37
+ html_content = html_file.read()
38
+ if isinstance(html_content, bytes):
39
+ html_content = html_content.decode("utf-8")
40
+ else:
41
+ html_content = str(html_file)
42
+ except Exception as e:
43
+ return {"error": f"Cannot read HTML file: {e}"}
44
 
45
+ soup = BeautifulSoup(html_content, "html.parser")
46
  words_json = []
47
  paragraphs_json = []
48
  y_offset = 0
49
  line_height = 20
50
  char_width = 10
51
 
 
52
  body = soup.body
53
  if not body:
54
+ body = soup
55
 
56
+ # iterate over all visible text nodes
57
  for element in body.find_all(text=True):
58
  text = element.strip()
59
  if not text:
60
  continue
61
 
 
62
  line_words = text.split()
63
  line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
64
 
 
76
  "bbox": line_bbox,
77
  "words": word_entries
78
  })
 
79
  y_offset += line_height
80
 
81
  output_json = {
82
  "words": words_json,
83
  "paragraphs": paragraphs_json
84
  }
 
85
  return output_json
86
 
87
  # ----------------- Image Loading -----------------