Update app.py
app.py CHANGED
@@ -1,59 +1,81 @@
Old version (several removed lines are truncated in the captured diff):

-import
 from PIL import Image, ImageDraw
 import requests
 from io import BytesIO
-
 
-#
 processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
 model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
 
 def load_image(image_file, image_url):
-    """
-    Load image from file or URL.
-    """
     if image_file:
         return image_file
     elif image_url:
         response = requests.get(image_url)
         return Image.open(BytesIO(response.content)).convert("RGB")
-
-    return None
 
-def
-    """
-    Detect text in an image and return annotated image + text coordinates.
-    """
     image = load_image(image_file, image_url)
     if image is None:
-        return None, "No image provided."
 
-    #
-
-    generated_ids = model.generate(pixel_values)
-    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-    # For demonstration: bounding box around the full image (TroCR doesn't return coordinates)
-    # For proper coordinates use an OCR model like PaddleOCR or EasyOCR
     draw = ImageDraw.Draw(image)
-
-
-
 
-    return image,
 
 iface = gr.Interface(
-    fn=
     inputs=[
         gr.Image(type="pil", label="Upload Image"),
         gr.Textbox(label="Image URL (optional)")
     ],
     outputs=[
         gr.Image(type="pil", label="Annotated Image"),
-        gr.Textbox(label="
     ],
-    title="
-    description="
 )
 
 if __name__ == "__main__":
New version:

+from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+import easyocr
 from PIL import Image, ImageDraw
+import numpy as np
+import gradio as gr
 import requests
 from io import BytesIO
+import json
 
+# TrOCR model for recognition
 processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
 model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
 
+# EasyOCR reader for bounding boxes
+reader = easyocr.Reader(['en'])
+
 def load_image(image_file, image_url):
     if image_file:
         return image_file
     elif image_url:
         response = requests.get(image_url)
         return Image.open(BytesIO(response.content)).convert("RGB")
+    return None
 
+def detect_text_trocr_json(image_file, image_url):
     image = load_image(image_file, image_url)
     if image is None:
+        return None, "No image provided.", None
 
+    # Step 1: Detect bounding boxes with EasyOCR
+    results = reader.readtext(np.array(image))
 
     draw = ImageDraw.Draw(image)
+    words_json = []
+    paragraph_json = []
+
+    for bbox, _, conf in results:
+        x_coords = [point[0] for point in bbox]
+        y_coords = [point[1] for point in bbox]
+        x_min, y_min = min(x_coords), min(y_coords)
+        x_max, y_max = max(x_coords), max(y_coords)
+
+        # Crop each word for recognition
+        word_crop = image.crop((x_min, y_min, x_max, y_max))
+        pixel_values = processor(images=word_crop, return_tensors="pt").pixel_values
+        generated_ids = model.generate(pixel_values)
+        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=2)
+
+        words_json.append({
+            "text": text,
+            "bbox": [x_min, y_min, x_max, y_max],
+            "confidence": float(conf)
+        })
+
+    paragraph_json = words_json.copy()
+
+    output_json = {
+        "words": words_json,
+        "paragraphs": paragraph_json
+    }
 
+    return image, json.dumps(output_json, indent=2), json.dumps(output_json)
 
 iface = gr.Interface(
+    fn=detect_text_trocr_json,
     inputs=[
         gr.Image(type="pil", label="Upload Image"),
         gr.Textbox(label="Image URL (optional)")
     ],
     outputs=[
         gr.Image(type="pil", label="Annotated Image"),
+        gr.Textbox(label="Text & Bounding Boxes (JSON)"),
+        gr.File(label="Download JSON")
     ],
+    title="Handwritten OCR with TrOCR + Bounding Boxes",
+    description="Detect handwritten text and bounding boxes. Uses TrOCR for recognition and EasyOCR for detection."
 )
 
 if __name__ == "__main__":
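Both versions of the hunk stop at the if __name__ == "__main__": guard; the body of that guard is outside the lines shown. For a Gradio Space the guard typically just launches the interface, so the assumed continuation (not part of the captured diff) would look like:

if __name__ == "__main__":
    # Assumed continuation; the actual launch arguments are not shown in the diff.
    iface.launch()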
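One detail of the new detect_text_trocr_json worth checking: depending on the EasyOCR version, the corner points returned by reader.readtext() can be NumPy scalars rather than plain Python numbers, and json.dumps() raises a TypeError on those. A small defensive adjustment inside the loop, assuming the rest of the committed function stays as-is, is to cast the box to built-in ints before it is stored:

        # Hypothetical tweak: cast NumPy scalars to plain ints so json.dumps() accepts them.
        x_min, y_min = int(min(x_coords)), int(min(y_coords))
        x_max, y_max = int(max(x_coords)), int(max(y_coords))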
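The third output component is gr.File(label="Download JSON"), but the function's third return value is a raw JSON string; gr.File generally serves a filesystem path, so the download may not work as committed. A minimal sketch of one way to bridge that, using a hypothetical helper named to_json_file:

import json
import tempfile

def to_json_file(output_json):
    # Write the OCR result to a temporary .json file and return its path,
    # which is the kind of value gr.File can serve for download.
    tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8")
    json.dump(output_json, tmp, indent=2)
    tmp.close()
    return tmp.name

# The last line of detect_text_trocr_json could then become:
#     return image, json.dumps(output_json, indent=2), to_json_file(output_json)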
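To sanity-check the new pipeline without the Gradio UI, detect_text_trocr_json can also be called directly; the snippet below is only an illustration, and sample.png is a placeholder path:

from PIL import Image

img = Image.open("sample.png").convert("RGB")   # placeholder input image
annotated, json_pretty, json_raw = detect_text_trocr_json(img, None)
print(json_pretty)            # per-word text, bounding box, and confidence
annotated.save("annotated.png")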