# qwen-od / app.py
import base64
import re
from io import BytesIO
from typing import List, Tuple, Optional
import gradio as gr
import requests
from PIL import Image
from huggingface_hub import InferenceClient
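# Third-party dependencies: gradio, requests, pillow, huggingface_hub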
# Hugging Face Inference Client (uses the free Inference API)
client = InferenceClient(model="Qwen/Qwen2.5-VL-32B-Instruct", provider="hf-inference")
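# The model is prompted (see `prompt` in predict below) to answer with one
# <box>(x1,y1,x2,y2):class_label</box> tag per detected object; this regex pulls
# the four pixel coordinates and the label text out of each tag.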
BOX_TAG_PATTERN = r"<box>\((\d+),(\d+),(\d+),(\d+)\):([^<]+)</box>"
def parse_bounding_boxes(text: str) -> List[Tuple[Tuple[int, int, int, int], str]]:
    """Extract ((x1, y1, x2, y2), label) pairs from the model's <box> tags."""
    matches = re.findall(BOX_TAG_PATTERN, text)
    return [((int(x1), int(y1), int(x2), int(y2)), label.strip()) for x1, y1, x2, y2, label in matches]
def fetch_image_from_url(url: str) -> Image.Image:
    """Download an image from a URL and return it as an RGB PIL image."""
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return Image.open(BytesIO(resp.content)).convert("RGB")
def pil_to_data_uri(img: Image.Image) -> str:
    """Encode a PIL image as a base64 PNG data URI for the chat API."""
    buffer = BytesIO()
    img.save(buffer, format="PNG")
    return "data:image/png;base64," + base64.b64encode(buffer.getvalue()).decode()
def predict(image: Optional[Image.Image], image_url: str):
    """Run object detection and return (annotated-image payload, status message)."""
    if image is None and not image_url:
        return None, "❌ Please provide an image or URL."
    if image is None:
        try:
            # Fetch the image locally for display; pass the original URL to the API.
            image = fetch_image_from_url(image_url)
            data_uri = image_url
        except Exception as e:
            return None, f"❌ {e}"
    else:
        image = image.convert("RGB")
        data_uri = pil_to_data_uri(image)
    prompt = (
        "Detect all objects in the provided image and output their bounding box "
        "coordinates and class labels in the format <box>(x1,y1,x2,y2):class_label</box>. "
        "If multiple objects are detected, list each bounding box and class label in a new <box> tag. "
        "Do not include any other text or descriptions."
    )
    stream = client.chat.completions.create(
        messages=[
            {"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": data_uri}},
            ]}
        ],
        stream=True,
    )
    # Concatenate the streamed chunks into the full model response.
    response_text = "".join(chunk.choices[0].delta.content or "" for chunk in stream)
    bboxes = parse_bounding_boxes(response_text)
    if not bboxes:
        return None, "⚠️ No objects detected."
    # gr.AnnotatedImage expects (image, [((x1, y1, x2, y2), label), ...]).
    return (image, bboxes), "✅ Detection complete."
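# Quick sanity check without the UI (hypothetical; assumes the example image exists):
#   result, msg = predict(Image.open("./example_images/example_1.png"), "")
#   print(msg, result[1] if result else None)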
def build_demo():
    """Build the Gradio Blocks UI."""
    theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald")
    with gr.Blocks(theme=theme, title="Qwen Object Detection Demo") as demo:
        gr.Markdown("## Qwen2.5-VL Object Detection Demo 🎯")
        gr.Markdown("Upload an image **or** paste an image URL, then click **Detect Objects 🚀**.")
        gr.Markdown("[Check out the model](https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct)")
        with gr.Tabs():
            with gr.TabItem("Upload Image"):
                img_input = gr.Image(type="pil", label="Upload Image", height=300)
                gr.Examples(
                    examples=[
                        ["./example_images/example_1.png"],
                        ["./example_images/example_2.jpg"],
                    ],
                    inputs=[img_input],
                    label="Click an example to try 👇",
                )
            with gr.TabItem("Image URL"):
                url_input = gr.Textbox(label="Image URL", placeholder="https://example.com/img.jpg")
                gr.Examples(
                    examples=[
                        [None, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/google-cloud/model-card.png"],
                        [None, "http://images.cocodataset.org/val2017/000000039769.jpg"],
                    ],
                    inputs=[img_input, url_input],
                    label="Click an example to try 👇",
                )
        detect_btn = gr.Button("Detect Objects 🚀")
        output_img = gr.AnnotatedImage(label="Detections", height=600)
        status = gr.Markdown()
        detect_btn.click(predict, inputs=[img_input, url_input], outputs=[output_img, status])
    return demo
def main():
    demo = build_demo()
    demo.launch()

if __name__ == "__main__":
    main()