Spaces:

rahul7star
/

OCR

Sleeping

rahul7star committed 17 days ago

Commit

d954be0

verified ·

1 Parent(s): ab4c9ec

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -17,8 +17,11 @@ model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwrit
 reader = easyocr.Reader(['en'])
 # ----------------- HTML Parsing -----------------
 def parse_html_to_json(html_file):
-    """Extract words and paragraphs from HTML in the same structure as image OCR"""
     if hasattr(html_file, "read"):
         html_content = html_file.read()
         if isinstance(html_content, bytes):
@@ -34,11 +37,17 @@ def parse_html_to_json(html_file):
     line_height = 20
     char_width = 10
-    for tag in soup.find_all(True):  # All tags
-        text = tag.get_text(separator=' ', strip=True)
         if not text:
             continue
         line_words = text.split()
         line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
@@ -46,16 +55,9 @@ def parse_html_to_json(html_file):
         x_offset = 0
         for word in line_words:
             word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
-            word_entries.append({
-                "text": word,
-                "bbox": word_bbox,
-                "confidence": 1.0
-            })
-            words_json.append({
-                "text": word,
-                "bbox": word_bbox,
-                "confidence": 1.0
-            })
             x_offset += char_width * (len(word) + 1)
         paragraphs_json.append({

 reader = easyocr.Reader(['en'])
 # ----------------- HTML Parsing -----------------
+from bs4 import BeautifulSoup
 def parse_html_to_json(html_file):
+    """Parse HTML and extract words/paragraphs JSON compatible with image OCR output."""
+    # Read content depending on object type
     if hasattr(html_file, "read"):
         html_content = html_file.read()
         if isinstance(html_content, bytes):
     line_height = 20
     char_width = 10
+    # Iterate over all text elements inside the body
+    body = soup.body
+    if not body:
+        body = soup  # fallback if <body> missing
+    for element in body.find_all(text=True):
+        text = element.strip()
         if not text:
             continue
+        # Split into words
         line_words = text.split()
         line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
         x_offset = 0
         for word in line_words:
             word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
+            word_entry = {"text": word, "bbox": word_bbox, "confidence": 1.0}
+            word_entries.append(word_entry)
+            words_json.append(word_entry)
             x_offset += char_width * (len(word) + 1)
         paragraphs_json.append({