Update app.py

app.py CHANGED
@@ -7,82 +7,164 @@ import json

Before (removed lines are prefixed with "-"):

import tempfile
import easyocr
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

-#
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
-# EasyOCR reader for bounding boxes
reader = easyocr.Reader(['en'])

-def
-
-    results = reader.readtext(np.array(image))
-    draw = ImageDraw.Draw(image)
    words_json = []
-
        })

-    paragraphs_json = words_json.copy()

    output_json = {
        "words": words_json,
-        "
    }

-    # Save JSON to a temporary file for Gradio download
-    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode='w')
-    tmp_file.write(json_str)
-    tmp_file.close()

iface = gr.Interface(
-    fn=
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
-        gr.Textbox(label="Image URL (optional)")
    ],
    outputs=[
-        gr.Image(type="pil", label="Annotated Image"),
-        gr.Textbox(label="
        gr.File(label="Download JSON")
    ],
-    title="
-    description="
)

if __name__ == "__main__":

After (added lines are prefixed with "+"):

import tempfile
import easyocr
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+from bs4 import BeautifulSoup
+import base64
+import re

+# Initialize OCR models
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
reader = easyocr.Reader(['en'])

+def extract_images_from_html(html_file):
+    """Extract images from HTML file (base64 or URLs)"""
+    images = []
+    soup = BeautifulSoup(html_file.read(), "html.parser")
+    for img_tag in soup.find_all("img"):
+        src = img_tag.get("src")
+        if not src:
+            continue
+        if src.startswith("data:image"):
+            b64_data = re.sub(r"^data:image/.+;base64,", "", src)
+            image = Image.open(BytesIO(base64.b64decode(b64_data))).convert("RGB")
+            images.append(image)
+        else:
+            try:
+                response = requests.get(src)
+                image = Image.open(BytesIO(response.content)).convert("RGB")
+                images.append(image)
+            except:
+                continue
+    return images
+
+def parse_html_text(html_file):
+    """Parse HTML text and generate approximate bounding boxes"""
+    html_content = html_file.read().decode("utf-8")
+    soup = BeautifulSoup(html_content, "html.parser")
+    body_text = soup.get_text(separator="\n")
+    lines = [line.strip() for line in body_text.split("\n") if line.strip()]

    words_json = []
+    lines_json = []
+
+    y_offset = 0
+    line_height = 20
+    char_width = 10
+
+    for line in lines:
+        line_words = line.split()
+        line_bbox = [0, y_offset, char_width * len(line), y_offset + line_height]
+
+        word_entries = []
+        x_offset = 0
+        for word in line_words:
+            word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
+            word_entries.append({
+                "text": word,
+                "bbox": word_bbox
+            })
+            words_json.append({
+                "text": word,
+                "bbox": word_bbox
+            })
+            x_offset += char_width * (len(word) + 1)
+
+        lines_json.append({
+            "text": line,
+            "bbox": line_bbox,
+            "words": word_entries
        })

+        y_offset += line_height

    output_json = {
        "words": words_json,
+        "lines": lines_json
    }

+    return html_content, output_json

+def load_image(image_file, image_url):
+    if image_file:
+        return [image_file]
+    elif image_url:
+        response = requests.get(image_url)
+        return [Image.open(BytesIO(response.content)).convert("RGB")]
+    return []
+
+def detect_text_combined(image_file, image_url, html_file):
+    # HTML path
+    if html_file:
+        html_content, output_json = parse_html_text(html_file)
+        json_str = json.dumps(output_json, indent=2)
+        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
+        tmp_file.write(json_str)
+        tmp_file.close()
+        return html_content, json_str, tmp_file.name
+
+    # Image path
+    images = load_image(image_file, image_url)
+    if not images:
+        return None, "No input provided.", None
+
+    all_output_json = []
+    annotated_images = []
+
+    for image in images:
+        results = reader.readtext(np.array(image))
+        draw = ImageDraw.Draw(image)
+        words_json = []
+
+        for bbox, _, conf in results:
+            x_coords = [float(point[0]) for point in bbox]
+            y_coords = [float(point[1]) for point in bbox]
+            x_min, y_min = min(x_coords), min(y_coords)
+            x_max, y_max = max(x_coords), max(y_coords)
+
+            # Crop word for TrOCR recognition
+            word_crop = image.crop((x_min, y_min, x_max, y_max))
+            pixel_values = processor(images=word_crop, return_tensors="pt").pixel_values
+            generated_ids = model.generate(pixel_values)
+            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+            draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=2)
+
+            words_json.append({
+                "text": text,
+                "bbox": [x_min, y_min, x_max, y_max],
+                "confidence": float(conf)
+            })
+
+        paragraphs_json = words_json.copy()
+        output_json = {
+            "words": words_json,
+            "paragraphs": paragraphs_json
+        }
+        json_str = json.dumps(output_json, indent=2)
+
+        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
+        tmp_file.write(json_str)
+        tmp_file.close()
+
+        annotated_images.append((image, json_str, tmp_file.name))
+
+    # Return first image for simplicity (can extend to gallery)
+    return annotated_images[0]

iface = gr.Interface(
+    fn=detect_text_combined,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
+        gr.Textbox(label="Image URL (optional)"),
+        gr.File(label="Upload HTML File", file_types=[".html", ".htm"])
    ],
    outputs=[
+        gr.Image(type="pil", label="Annotated Image / N/A for HTML"),
+        gr.Textbox(label="JSON Output"),
        gr.File(label="Download JSON")
    ],
+    title="Combined OCR & HTML Text Bounding Box Extractor",
+    description="Upload an image, provide an image URL, or upload an HTML file. Outputs word- and line-level bounding boxes in JSON with annotated images for images."
)

if __name__ == "__main__":
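
For a quick local check of the new combined entry point outside the Gradio UI, something like the lines below could be used. This is a minimal sketch, not part of the commit: the sample file names are placeholders, and it assumes the imports above the hunk (PIL's Image/ImageDraw, BytesIO, requests, numpy as np, json, gradio as gr) are present in app.py, as the hunk context suggests.

# Hypothetical smoke test; assumes app.py is importable and the sample files exist.
from PIL import Image
from app import detect_text_combined  # loads the TrOCR and EasyOCR models on import

# Image path: EasyOCR proposes word boxes, TrOCR re-reads each crop.
# Returns (annotated PIL image, JSON string, path to a temporary JSON file).
img = Image.open("sample.png").convert("RGB")  # placeholder file name
annotated, json_str, json_path = detect_text_combined(img, None, None)

# HTML path: parse_html_text() derives approximate word/line boxes from the markup.
# Returns (raw HTML string, JSON string, path to a temporary JSON file).
with open("sample.html", "rb") as f:  # placeholder file name
    html_text, json_str, json_path = detect_text_combined(None, None, f)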