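"""Gradio demo: object detection with Qwen2.5-VL via the Hugging Face Inference API.

The model is prompted to return bounding boxes as <box>(x1,y1,x2,y2):label</box>
tags, which are parsed with a regex and drawn on a gr.AnnotatedImage.
"""
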
import base64
import re
from io import BytesIO
from typing import List, Tuple, Optional

import gradio as gr
import requests
from PIL import Image
from huggingface_hub import InferenceClient

# Hugging Face Inference Client (uses the free Inference API)
client = InferenceClient(model="Qwen/Qwen2.5-VL-32B-Instruct", provider="hf-inference")

# Matches model output such as <box>(10,20,110,220):cat</box>
BOX_TAG_PATTERN = r"<box>\((\d+),(\d+),(\d+),(\d+)\):([^<]+)</box>"

def parse_bounding_boxes(text: str) -> List[Tuple[Tuple[int, int, int, int], str]]:
    matches = re.findall(BOX_TAG_PATTERN, text)
    return [((int(x1), int(y1), int(x2), int(y2)), label.strip()) for x1, y1, x2, y2, label in matches]

def fetch_image_from_url(url: str) -> Image.Image:
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return Image.open(BytesIO(resp.content)).convert("RGB")

def pil_to_data_uri(img: Image.Image) -> str:
    buffer = BytesIO()
    img.save(buffer, format="PNG")
    return "data:image/png;base64," + base64.b64encode(buffer.getvalue()).decode()

def predict(image: Optional[Image.Image], image_url: str):
    if image is None and not image_url:
        return None, "❌ Please provide an image or URL."

    if image is None:
        try:
            image = fetch_image_from_url(image_url)
            data_uri = image_url  # a remote image URL can be passed to the API as-is
        except Exception as e:
            return None, f"❌ {e}"
    else:
        image = image.convert("RGB")
        data_uri = pil_to_data_uri(image)

    prompt = (
        "Detect all objects in the provided image and output their bounding box "
        "coordinates and class labels in the format <box>(x1,y1,x2,y2):class_label</box>. "
        "If multiple objects are detected, list each bounding box and class label in a new <box> tag. "
        "Do not include any other text or descriptions."
    )

    stream = client.chat.completions.create(
        messages=[
            {"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": data_uri}},
            ]}
        ],
        stream=True,
    )
    # Accumulate the streamed completion; guard against chunks without choices.
    response_text = "".join(
        chunk.choices[0].delta.content or "" for chunk in stream if chunk.choices
    )

    bboxes = parse_bounding_boxes(response_text)
    if not bboxes:
        return None, "⚠️ No objects detected."

    # gr.AnnotatedImage accepts (image, [((x1, y1, x2, y2), label), ...]) directly.
    return (image, bboxes), "✅ Detection complete."


def build_demo():
    theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald")
    with gr.Blocks(theme=theme, title="Qwen Object Detection Demo") as demo:
        gr.Markdown("## Qwen2.5‑VL Object Detection Demo 🎯")
        gr.Markdown("Upload an image **or** paste an image URL, then click **Detect Objects πŸš€**.")
        gr.Markdown("[Check out the model](https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct)")

        with gr.Tabs():
            with gr.TabItem("Upload Image"):
                img_input = gr.Image(type="pil", label="Upload Image", height=300)
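                # Local example files, assumed to ship alongside this script.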
                gr.Examples(
                    examples=[
                        ["./example_images/example_1.png"],
                        ["./example_images/example_2.jpg"],
                    ],
                    inputs=[img_input],
                    label="Click an example to try πŸ‘‡",
                )

            with gr.TabItem("Image URL"):
                url_input = gr.Textbox(label="Image URL", placeholder="https://example.com/img.jpg")
                gr.Examples(
                    examples=[
                        [None, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/google-cloud/model-card.png"],
                        [None, "http://images.cocodataset.org/val2017/000000039769.jpg"],
                    ],
                    inputs=[img_input, url_input],
                    label="Click an example to try πŸ‘‡",
                )

        detect_btn = gr.Button("Detect Objects 🚀")
        output_img = gr.AnnotatedImage(label="Detections", height=600)
        status = gr.Markdown()

        detect_btn.click(predict, inputs=[img_input, url_input], outputs=[output_img, status])
    return demo


def main():
    demo = build_demo()
    demo.launch()

if __name__ == "__main__":
    main()
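
# Assuming this file is saved as app.py, run locally with: python app.py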