File size: 2,023 Bytes
1c4e9d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import gradio as gr
from PIL import Image, ImageDraw
import requests
from io import BytesIO
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Load the TrOCR handwriting checkpoint once at import time. The processor
# (image preprocessing + token decoding) and the encoder-decoder model must
# come from the same checkpoint, so the name is spelled out only once.
_TROCR_CHECKPOINT = "microsoft/trocr-base-handwritten"
processor = TrOCRProcessor.from_pretrained(_TROCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_TROCR_CHECKPOINT)

def load_image(image_file, image_url, timeout=10):
    """
    Load an image from an uploaded file or, failing that, from a URL.

    The uploaded image takes precedence; the URL is only consulted when no
    image was uploaded.

    Args:
        image_file: The uploaded image object, or ``None``. It is returned
            unchanged (no mode conversion is applied here).
        image_url: URL of an image to download. ``None``, an empty string or
            a whitespace-only string all count as "no URL".
        timeout: Seconds to wait for the remote server before giving up.

    Returns:
        The uploaded image as given, the downloaded image converted to RGB,
        or ``None`` when neither input was supplied.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server responds with an HTTP error status.
        OSError: If the downloaded bytes cannot be decoded as an image.
    """
    # Explicit None check: do not rely on the truthiness of an image object.
    if image_file is not None:
        return image_file

    url = (image_url or "").strip()
    if not url:
        return None

    # NOTE(review): the URL is user-supplied and fetched from the server
    # side; consider restricting allowed hosts if this app is exposed publicly.
    # A timeout keeps a stalled server from blocking the request forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page.
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert("RGB")

def detect_text(image_file, image_url):
    """
    Recognise text in an image and return an annotated copy plus a summary.

    Args:
        image_file: Uploaded image, or ``None``; forwarded to ``load_image``.
        image_url: Image URL, or ``None``/blank; forwarded to ``load_image``.

    Returns:
        A ``(image, message)`` tuple. On success ``image`` is an RGB copy of
        the input with a red outline along its border and ``message`` holds
        the bounding box and the recognised text. When no image is supplied
        or it cannot be loaded, ``image`` is ``None`` and ``message``
        explains why.
    """
    try:
        image = load_image(image_file, image_url)
    except (requests.RequestException, OSError) as exc:
        # Download or decode failure: report it in the text output rather
        # than letting the exception escape to the UI.
        return None, f"Could not load image: {exc}"
    if image is None:
        return None, "No image provided."

    # Uploaded images may be RGBA, palette or grayscale. Normalise to RGB for
    # the processor; convert() returns a copy, so the caller's image object
    # is not modified by the drawing below.
    image = image.convert("RGB")

    # Preprocess the image, generate token ids, and decode them to a string.
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # For demonstration: bounding box around the full image (TrOCR doesn't return coordinates)
    # For proper coordinates use an OCR model like PaddleOCR or EasyOCR
    draw = ImageDraw.Draw(image)
    w, h = image.size
    # Pillow treats the second corner as inclusive, so the last valid pixel
    # is (w - 1, h - 1); using (w, h) pushes part of the outline off-image.
    draw.rectangle([0, 0, w - 1, h - 1], outline="red", width=3)
    coords_str = f"Full image bounding box: [0,0,{w},{h}]\nDetected text: {text}"

    return image, coords_str

# Input widgets, listed in the positional order detect_text expects:
# the uploaded image first, then the optional URL.
_input_components = [
    gr.Image(type="pil", label="Upload Image"),
    gr.Textbox(label="Image URL (optional)"),
]

# Output widgets, matching detect_text's (image, message) return tuple.
_output_components = [
    gr.Image(type="pil", label="Annotated Image"),
    gr.Textbox(label="Detected Text & Coordinates"),
]

# Gradio UI wiring the two inputs and two outputs to detect_text.
iface = gr.Interface(
    fn=detect_text,
    inputs=_input_components,
    outputs=_output_components,
    title="Text Detection from Image",
    description="Upload an image or enter an image URL, and the app will detect text and show bounding boxes.",
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()