Spaces:

rahul7star
/

OCR

Sleeping

App Files Community

rahul7star committed 17 days ago

Commit

ab4c9ec

verified ·

1 Parent(s): 051d865

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -45

app.py CHANGED Viewed

@@ -11,35 +11,14 @@ from bs4 import BeautifulSoup
 import base64
 import re
-# Initialize OCR models
 processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
 model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
 reader = easyocr.Reader(['en'])
-# ----------------- HTML Utilities -----------------
-def extract_images_from_html(html_content):
-    """Extract images from HTML content (base64 or URLs)"""
-    images = []
-    soup = BeautifulSoup(html_content, "html.parser")
-    for img_tag in soup.find_all("img"):
-        src = img_tag.get("src")
-        if not src:
-            continue
-        if src.startswith("data:image"):
-            b64_data = re.sub(r"^data:image/.+;base64,", "", src)
-            image = Image.open(BytesIO(base64.b64decode(b64_data))).convert("RGB")
-            images.append(image)
-        else:
-            try:
-                response = requests.get(src)
-                image = Image.open(BytesIO(response.content)).convert("RGB")
-                images.append(image)
-            except:
-                continue
-    return images
-def parse_html_words(html_file):
-    """Extract words and lines from HTML with approximate bounding boxes"""
     if hasattr(html_file, "read"):
         html_content = html_file.read()
         if isinstance(html_content, bytes):
@@ -47,18 +26,16 @@ def parse_html_words(html_file):
     else:
         html_content = str(html_file)
-    images_in_html = extract_images_from_html(html_content)
     soup = BeautifulSoup(html_content, "html.parser")
     words_json = []
-    lines_json = []
     y_offset = 0
     line_height = 20
     char_width = 10
-    # Traverse visible text blocks
-    for block in soup.find_all(['p', 'div', 'span', 'h1', 'h2', 'h3', 'li']):
-        text = block.get_text(separator=' ', strip=True)
         if not text:
             continue
@@ -71,15 +48,17 @@ def parse_html_words(html_file):
             word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
             word_entries.append({
                 "text": word,
-                "bbox": word_bbox
             })
             words_json.append({
                 "text": word,
-                "bbox": word_bbox
             })
             x_offset += char_width * (len(word) + 1)
-        lines_json.append({
             "text": text,
             "bbox": line_bbox,
             "words": word_entries
@@ -89,13 +68,12 @@ def parse_html_words(html_file):
     output_json = {
         "words": words_json,
-        "lines": lines_json,
-        "images_found": len(images_in_html)
     }
-    return output_json, images_in_html
-# ----------------- Image Utilities -----------------
 def load_image(image_file, image_url):
     if image_file:
         return [image_file]
@@ -104,19 +82,19 @@ def load_image(image_file, image_url):
         return [Image.open(BytesIO(response.content)).convert("RGB")]
     return []
-# ----------------- Main Combined Logic -----------------
 def detect_text_combined(image_file, image_url, html_file):
-    # HTML path
     if html_file:
-        output_json, images_in_html = parse_html_words(html_file)
         json_str = json.dumps(output_json, indent=2)
         tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
         tmp_file.write(json_str)
         tmp_file.close()
-        annotated_image = images_in_html[0] if images_in_html else None
         return annotated_image, json_str, tmp_file.name
-    # Image path
     images = load_image(image_file, image_url)
     if not images:
         return None, "No input provided.", None
@@ -168,12 +146,12 @@ iface = gr.Interface(
         gr.File(label="Upload HTML File", file_types=[".html", ".htm"])
     ],
     outputs=[
-        gr.Image(type="pil", label="Annotated Image / First Extracted Image for HTML"),
         gr.Textbox(label="JSON Output"),
         gr.File(label="Download JSON")
     ],
     title="Combined OCR & HTML Text Bounding Box Extractor",
-    description="Upload an image, provide an image URL, or upload an HTML file. Outputs word- and line-level bounding boxes in JSON with annotated images for images."
 )
 if __name__ == "__main__":

 import base64
 import re
+# ----------------- Initialize OCR -----------------
 processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
 model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
 reader = easyocr.Reader(['en'])
+# ----------------- HTML Parsing -----------------
+def parse_html_to_json(html_file):
+    """Extract words and paragraphs from HTML in the same structure as image OCR"""
     if hasattr(html_file, "read"):
         html_content = html_file.read()
         if isinstance(html_content, bytes):
     else:
         html_content = str(html_file)
     soup = BeautifulSoup(html_content, "html.parser")
     words_json = []
+    paragraphs_json = []
     y_offset = 0
     line_height = 20
     char_width = 10
+    for tag in soup.find_all(True):  # All tags
+        text = tag.get_text(separator=' ', strip=True)
         if not text:
             continue
             word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
             word_entries.append({
                 "text": word,
+                "bbox": word_bbox,
+                "confidence": 1.0
             })
             words_json.append({
                 "text": word,
+                "bbox": word_bbox,
+                "confidence": 1.0
             })
             x_offset += char_width * (len(word) + 1)
+        paragraphs_json.append({
             "text": text,
             "bbox": line_bbox,
             "words": word_entries
     output_json = {
         "words": words_json,
+        "paragraphs": paragraphs_json
     }
+    return output_json
+# ----------------- Image Loading -----------------
 def load_image(image_file, image_url):
     if image_file:
         return [image_file]
         return [Image.open(BytesIO(response.content)).convert("RGB")]
     return []
+# ----------------- Main Logic -----------------
 def detect_text_combined(image_file, image_url, html_file):
+    # ----------------- HTML Path -----------------
     if html_file:
+        output_json = parse_html_to_json(html_file)
         json_str = json.dumps(output_json, indent=2)
         tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
         tmp_file.write(json_str)
         tmp_file.close()
+        annotated_image = None
         return annotated_image, json_str, tmp_file.name
+    # ----------------- Image Path -----------------
     images = load_image(image_file, image_url)
     if not images:
         return None, "No input provided.", None
         gr.File(label="Upload HTML File", file_types=[".html", ".htm"])
     ],
     outputs=[
+        gr.Image(type="pil", label="Annotated Image"),
         gr.Textbox(label="JSON Output"),
         gr.File(label="Download JSON")
     ],
     title="Combined OCR & HTML Text Bounding Box Extractor",
+    description="Upload an image, provide an image URL, or upload an HTML file. Outputs word- and paragraph-level bounding boxes in JSON format consistent with image OCR output."
 )
 if __name__ == "__main__":