Spaces:

rahul7star
/

OCR

Sleeping

rahul7star committed on 11 days ago

Commit

240048f

verified ·

1 Parent(s): d954be0

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -19,15 +19,23 @@ reader = easyocr.Reader(['en'])
 # ----------------- HTML Parsing -----------------
 from bs4 import BeautifulSoup
 def parse_html_to_json(html_file):
-    """Parse HTML and extract words/paragraphs JSON compatible with image OCR output."""
-    # Read content depending on object type
-    if hasattr(html_file, "read"):
         html_content = html_file.read()
         if isinstance(html_content, bytes):
             html_content = html_content.decode("utf-8")
-    else:
-        html_content = str(html_file)
     soup = BeautifulSoup(html_content, "html.parser")
@@ -37,17 +45,18 @@ def parse_html_to_json(html_file):
     line_height = 20
     char_width = 10
-    # Iterate over all text elements inside the body
     body = soup.body
     if not body:
-        body = soup  # fallback if <body> missing
     for element in body.find_all(text=True):
         text = element.strip()
         if not text:
             continue
-        # Split into words
         line_words = text.split()
         line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]

 # ----------------- HTML Parsing -----------------
 from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup
 def parse_html_to_json(html_file):
+    """
+    Parse HTML content from a Gradio file input or string and produce
+    words/paragraphs JSON compatible with image OCR output.
+    """
+    # Handle Gradio NamedString, str, or file-like object
+    html_content = ""
+    if hasattr(html_file, "read"):  # real file
         html_content = html_file.read()
         if isinstance(html_content, bytes):
             html_content = html_content.decode("utf-8")
+    elif isinstance(html_file, str):
+        html_content = html_file
+    else:  # Gradio NamedString
+        html_content = getattr(html_file, "name", str(html_file))
     soup = BeautifulSoup(html_content, "html.parser")
     line_height = 20
     char_width = 10
+    # iterate over all visible text nodes in the body
     body = soup.body
     if not body:
+        body = soup  # fallback
+    # Only consider visible text
     for element in body.find_all(text=True):
         text = element.strip()
         if not text:
             continue
+        # split into words
         line_words = text.split()
         line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]