import base64
import re
from io import BytesIO
from typing import List, Tuple, Optional

import gradio as gr
import requests
from PIL import Image
from huggingface_hub import InferenceClient

# Hugging Face Inference Client (uses the free Inference API)
client = InferenceClient(model="Qwen/Qwen2.5-VL-32B-Instruct", provider="hf-inference")

# Matches "(x1,y1,x2,y2):class_label" entries in the model's response.
BOX_TAG_PATTERN = r"\((\d+),(\d+),(\d+),(\d+)\):([^<]+)"


def parse_bounding_boxes(text: str) -> List[Tuple[Tuple[int, int, int, int], str]]:
    """Extract ((x1, y1, x2, y2), label) pairs from the model's text output."""
    matches = re.findall(BOX_TAG_PATTERN, text)
    return [
        ((int(x1), int(y1), int(x2), int(y2)), label.strip())
        for x1, y1, x2, y2, label in matches
    ]


def fetch_image_from_url(url: str) -> Image.Image:
    """Download an image from a URL and return it as an RGB PIL image."""
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return Image.open(BytesIO(resp.content)).convert("RGB")


def pil_to_data_uri(img: Image.Image) -> str:
    """Encode a PIL image as a base64 PNG data URI."""
    buffer = BytesIO()
    img.save(buffer, format="PNG")
    return "data:image/png;base64," + base64.b64encode(buffer.getvalue()).decode()


def predict(image: Optional[Image.Image], image_url: str):
    if image is None and not image_url:
        return None, "❌ Please provide an image or URL."

    if image is None:
        try:
            image = fetch_image_from_url(image_url)
            data_uri = image_url  # remote URLs can be passed to the API as-is
        except Exception as e:
            return None, f"❌ {e}"
    else:
        image = image.convert("RGB")
        data_uri = pil_to_data_uri(image)

    prompt = (
        "Detect all objects in the provided image and output their bounding box "
        "coordinates and class labels in the format (x1,y1,x2,y2):class_label. "
        "If multiple objects are detected, list each bounding box and class label in a new tag. "
        "Do not include any other text or descriptions."
    )

    stream = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": data_uri}},
                ],
            }
        ],
        stream=True,
    )
    response_text = "".join(chunk.choices[0].delta.content or "" for chunk in stream)

    bboxes = parse_bounding_boxes(response_text)
    if not bboxes:
        return None, "⚠️ No objects detected."

    # gr.AnnotatedImage expects (image, [((x1, y1, x2, y2), label), ...]).
    return (image, bboxes), "✅ Detection complete."


def build_demo():
    theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald")
    with gr.Blocks(theme=theme, title="Qwen Object Detection Demo") as demo:
        gr.Markdown("## Qwen2.5‑VL Object Detection Demo 🎯")
        gr.Markdown("Upload an image **or** paste an image URL, then click **Detect Objects 🚀**.")
        gr.Markdown("[Check out the model](https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct)")

        with gr.Tabs():
            with gr.TabItem("Upload Image"):
                img_input = gr.Image(type="pil", label="Upload Image", height=300)
                gr.Examples(
                    examples=[
                        ["./example_images/example_1.png"],
                        ["./example_images/example_2.jpg"],
                    ],
                    inputs=[img_input],
                    label="Click an example to try 👇",
                )
            with gr.TabItem("Image URL"):
                url_input = gr.Textbox(label="Image URL", placeholder="https://example.com/img.jpg")
                gr.Examples(
                    examples=[
                        [None, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/google-cloud/model-card.png"],
                        [None, "http://images.cocodataset.org/val2017/000000039769.jpg"],
                    ],
                    inputs=[img_input, url_input],
                    label="Click an example to try 👇",
                )

        detect_btn = gr.Button("Detect Objects 🚀")
        output_img = gr.AnnotatedImage(label="Detections", height=600)
        status = gr.Markdown()

        detect_btn.click(predict, inputs=[img_input, url_input], outputs=[output_img, status])

    return demo


def main():
    demo = build_demo()
    demo.launch()


if __name__ == "__main__":
    main()