Spaces:

rahul7star
/

OCR

Running

App Files Community

rahul7star committed 19 days ago

Commit

051d865

verified ·

1 Parent(s): eeb0aa9

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -20

app.py CHANGED Viewed

@@ -16,6 +16,7 @@ processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
 model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
 reader = easyocr.Reader(['en'])
 def extract_images_from_html(html_content):
     """Extract images from HTML content (base64 or URLs)"""
     images = []
@@ -37,34 +38,32 @@ def extract_images_from_html(html_content):
                 continue
     return images
-def parse_html_text(html_file):
-    """Parse HTML text and generate approximate bounding boxes"""
-    # Handle different Gradio file types
     if hasattr(html_file, "read"):
         html_content = html_file.read()
         if isinstance(html_content, bytes):
             html_content = html_content.decode("utf-8")
     else:
-        # NamedString object (Gradio v3.40+)
         html_content = str(html_file)
-    # Extract images from HTML (optional, for OCR later)
     images_in_html = extract_images_from_html(html_content)
     soup = BeautifulSoup(html_content, "html.parser")
-    body_text = soup.get_text(separator="\n")
-    lines = [line.strip() for line in body_text.split("\n") if line.strip()]
     words_json = []
     lines_json = []
     y_offset = 0
     line_height = 20
     char_width = 10
-    for line in lines:
-        line_words = line.split()
-        line_bbox = [0, y_offset, char_width * len(line), y_offset + line_height]
         word_entries = []
         x_offset = 0
@@ -81,7 +80,7 @@ def parse_html_text(html_file):
             x_offset += char_width * (len(word) + 1)
         lines_json.append({
-            "text": line,
             "bbox": line_bbox,
             "words": word_entries
         })
@@ -94,8 +93,9 @@ def parse_html_text(html_file):
         "images_found": len(images_in_html)
     }
-    return html_content, output_json, images_in_html
 def load_image(image_file, image_url):
     if image_file:
         return [image_file]
@@ -104,18 +104,16 @@ def load_image(image_file, image_url):
         return [Image.open(BytesIO(response.content)).convert("RGB")]
     return []
 def detect_text_combined(image_file, image_url, html_file):
     # HTML path
     if html_file:
-        html_content, output_json, images_in_html = parse_html_text(html_file)
         json_str = json.dumps(output_json, indent=2)
         tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
         tmp_file.write(json_str)
         tmp_file.close()
-        annotated_image = None
-        if images_in_html:
-            # For demo, show first extracted image if exists
-            annotated_image = images_in_html[0]
         return annotated_image, json_str, tmp_file.name
     # Image path
@@ -161,6 +159,7 @@ def detect_text_combined(image_file, image_url, html_file):
     return annotated_image, json_str, tmp_file.name
 iface = gr.Interface(
     fn=detect_text_combined,
     inputs=[
@@ -169,7 +168,7 @@ iface = gr.Interface(
         gr.File(label="Upload HTML File", file_types=[".html", ".htm"])
     ],
     outputs=[
-        gr.Image(type="pil", label="Annotated Image / N/A for HTML"),
         gr.Textbox(label="JSON Output"),
         gr.File(label="Download JSON")
     ],

 model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
 reader = easyocr.Reader(['en'])
+# ----------------- HTML Utilities -----------------
 def extract_images_from_html(html_content):
     """Extract images from HTML content (base64 or URLs)"""
     images = []
                 continue
     return images
+def parse_html_words(html_file):
+    """Extract words and lines from HTML with approximate bounding boxes"""
     if hasattr(html_file, "read"):
         html_content = html_file.read()
         if isinstance(html_content, bytes):
             html_content = html_content.decode("utf-8")
     else:
         html_content = str(html_file)
     images_in_html = extract_images_from_html(html_content)
     soup = BeautifulSoup(html_content, "html.parser")
     words_json = []
     lines_json = []
     y_offset = 0
     line_height = 20
     char_width = 10
+    # Traverse visible text blocks
+    for block in soup.find_all(['p', 'div', 'span', 'h1', 'h2', 'h3', 'li']):
+        text = block.get_text(separator=' ', strip=True)
+        if not text:
+            continue
+        line_words = text.split()
+        line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
         word_entries = []
         x_offset = 0
             x_offset += char_width * (len(word) + 1)
         lines_json.append({
+            "text": text,
             "bbox": line_bbox,
             "words": word_entries
         })
         "images_found": len(images_in_html)
     }
+    return output_json, images_in_html
+# ----------------- Image Utilities -----------------
 def load_image(image_file, image_url):
     if image_file:
         return [image_file]
         return [Image.open(BytesIO(response.content)).convert("RGB")]
     return []
+# ----------------- Main Combined Logic -----------------
 def detect_text_combined(image_file, image_url, html_file):
     # HTML path
     if html_file:
+        output_json, images_in_html = parse_html_words(html_file)
         json_str = json.dumps(output_json, indent=2)
         tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
         tmp_file.write(json_str)
         tmp_file.close()
+        annotated_image = images_in_html[0] if images_in_html else None
         return annotated_image, json_str, tmp_file.name
     # Image path
     return annotated_image, json_str, tmp_file.name
+# ----------------- Gradio Interface -----------------
 iface = gr.Interface(
     fn=detect_text_combined,
     inputs=[
         gr.File(label="Upload HTML File", file_types=[".html", ".htm"])
     ],
     outputs=[
+        gr.Image(type="pil", label="Annotated Image / First Extracted Image for HTML"),
         gr.Textbox(label="JSON Output"),
         gr.File(label="Download JSON")
     ],