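"""Gradio demo: object detection with Qwen2.5-VL via the Hugging Face Inference API.

The model is prompted to return bounding boxes as <box>(x1,y1,x2,y2):label</box>
tags, which are parsed with a regex and drawn on a gr.AnnotatedImage.
"""
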
import base64
import re
from io import BytesIO
from typing import List, Tuple, Optional

import gradio as gr
import requests
from PIL import Image
from huggingface_hub import InferenceClient

# Hugging Face Inference Client (uses the free Inference API)
client = InferenceClient(model="Qwen/Qwen2.5-VL-32B-Instruct", provider="hf-inference")

# Matches model output such as <box>(10,20,110,220):cat</box>
BOX_TAG_PATTERN = r"<box>\((\d+),(\d+),(\d+),(\d+)\):([^<]+)</box>"

def parse_bounding_boxes(text: str) -> List[Tuple[Tuple[int, int, int, int], str]]:
    matches = re.findall(BOX_TAG_PATTERN, text)
    return [((int(x1), int(y1), int(x2), int(y2)), label.strip()) for x1, y1, x2, y2, label in matches]

def fetch_image_from_url(url: str) -> Image.Image:
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return Image.open(BytesIO(resp.content)).convert("RGB")

def pil_to_data_uri(img: Image.Image) -> str:
    buffer = BytesIO()
    img.save(buffer, format="PNG")
    return "data:image/png;base64," + base64.b64encode(buffer.getvalue()).decode()

def predict(image: Optional[Image.Image], image_url: str):
    if image is None and not image_url:
        return None, "❌ Please provide an image or URL."

    if image is None:
        try:
            image = fetch_image_from_url(image_url)
            data_uri = image_url  # a remote image URL can be passed to the API as-is
        except Exception as e:
            return None, f"❌ {e}"
    else:
        image = image.convert("RGB")
        data_uri = pil_to_data_uri(image)

    prompt = (
        "Detect all objects in the provided image and output their bounding box "
        "coordinates and class labels in the format <box>(x1,y1,x2,y2):class_label</box>. "
        "If multiple objects are detected, list each bounding box and class label in a new <box> tag. "
        "Do not include any other text or descriptions."
    )

    stream = client.chat.completions.create(
        messages=[
            {"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": data_uri}},
            ]}
        ],
        stream=True,
    )
    # Accumulate the streamed completion; guard against chunks without choices.
    response_text = "".join(
        chunk.choices[0].delta.content or "" for chunk in stream if chunk.choices
    )

    bboxes = parse_bounding_boxes(response_text)
    if not bboxes:
        return None, "⚠️ No objects detected."

    # gr.AnnotatedImage accepts (image, [((x1, y1, x2, y2), label), ...]) directly.
    return (image, bboxes), "✅ Detection complete."


def build_demo():
    theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald")
    with gr.Blocks(theme=theme, title="Qwen Object Detection Demo") as demo:
        gr.Markdown("## Qwen2.5‑VL Object Detection Demo 🎯")
        gr.Markdown("Upload an image **or** paste an image URL, then click **Detect Objects πŸš€**.")
        gr.Markdown("[Check out the model](https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct)")

        with gr.Tabs():
            with gr.TabItem("Upload Image"):
                img_input = gr.Image(type="pil", label="Upload Image", height=300)
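                # Local example files, assumed to ship alongside this script.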
                gr.Examples(
                    examples=[
                        ["./example_images/example_1.png"],
                        ["./example_images/example_2.jpg"],
                    ],
                    inputs=[img_input],
                    label="Click an example to try πŸ‘‡",
                )

            with gr.TabItem("Image URL"):
                url_input = gr.Textbox(label="Image URL", placeholder="https://example.com/img.jpg")
                gr.Examples(
                    examples=[
                        [None, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/google-cloud/model-card.png"],
                        [None, "http://images.cocodataset.org/val2017/000000039769.jpg"],
                    ],
                    inputs=[img_input, url_input],
                    label="Click an example to try πŸ‘‡",
                )

        detect_btn = gr.Button("Detect Objects 🚀")
        output_img = gr.AnnotatedImage(label="Detections", height=600)
        status = gr.Markdown()

        detect_btn.click(predict, inputs=[img_input, url_input], outputs=[output_img, status])
    return demo


def main():
    demo = build_demo()
    demo.launch()

if __name__ == "__main__":
    main()
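
# Assuming this file is saved as app.py, run locally with: python app.py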