Spaces: Running on Zero
Create app.py
app.py ADDED
@@ -0,0 +1,171 @@
import torch
from PIL import Image, ImageDraw, ImageFont
from transformers import GroundingDinoProcessor
from modeling_grounding_dino import GroundingDinoForObjectDetection

from itertools import cycle

import gradio as gr

import spaces

# Load model and processor
model_id = "fushh7/llmdet_swin_large_hf"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"[INFO] Using device: {DEVICE}")
print(f"[INFO] Loading model from {model_id}...")

processor = GroundingDinoProcessor.from_pretrained(model_id)
model = GroundingDinoForObjectDetection.from_pretrained(model_id).to(DEVICE)
model.eval()

print("[INFO] Model loaded successfully.")
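# Note: LLMDet reuses the Grounding DINO interface, so the stock
# GroundingDinoProcessor pairs with the checkpoint's custom modeling file
# (modeling_grounding_dino.py) imported above.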
# Pre-defined palette (extend or tweak as you like)
BOX_COLORS = [
    "deepskyblue", "red", "lime", "dodgerblue",
    "cyan", "magenta", "yellow",
    "orange", "chartreuse"
]


def draw_boxes(image, boxes, labels, scores, colors=BOX_COLORS, font_path="arial.ttf", font_size=16):
    """
    Draw bounding boxes and labels on a PIL Image.

    :param image: PIL Image object
    :param boxes: Iterable of [x_min, y_min, x_max, y_max]
    :param labels: Iterable of label strings
    :param scores: Iterable of scalar confidences (0-1)
    :param colors: List/tuple of colour names or RGB tuples
    :param font_path: Path to a TTF font for labels
    :param font_size: Int size of the label font (default 16)
    :return: PIL Image with drawn boxes
    """
    # Ensure we can iterate colours indefinitely
    colour_cycle = cycle(colors)
    draw = ImageDraw.Draw(image)

    # Pick a font (fall back to the default if the TTF is missing)
    try:
        font = ImageFont.truetype(font_path, size=font_size)
    except IOError:
        font = ImageFont.load_default(size=font_size)

    # Assign a consistent colour per label
    label_to_colour = {}

    for box, label, score in zip(boxes, labels, scores):
        # Reuse colour if label seen before, else take next from cycle
        colour = label_to_colour.setdefault(label, next(colour_cycle))

        x_min, y_min, x_max, y_max = map(int, box)

        # Draw rectangle
        draw.rectangle([x_min, y_min, x_max, y_max], outline=colour, width=2)

        # Compose the label text and measure it
        text = f"{label} ({score:.3f})"
        text_size = draw.textbbox((0, 0), text, font=font)[2:]

        # Draw a filled background behind the text for legibility
        bg_coords = [x_min, y_min - text_size[1] - 4,
                     x_min + text_size[0] + 4, y_min]
        draw.rectangle(bg_coords, fill=colour)

        # Draw the label text
        draw.text((x_min + 2, y_min - text_size[1] - 2),
                  text, fill="black", font=font)

    return image
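# Example usage of draw_boxes (hypothetical values):
#   img = draw_boxes(img, boxes=[[10, 20, 110, 220]], labels=["a cat"], scores=[0.87])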
def resize_image_max_dimension(image, max_size=1024):
    """
    Resize an image so that the longest side is at most max_size pixels,
    while maintaining the aspect ratio.

    :param image: PIL Image object
    :param max_size: Maximum dimension in pixels (default: 1024)
    :return: PIL Image object (resized)
    """
    width, height = image.size

    # No resizing needed if the image is already small enough
    if max(width, height) <= max_size:
        return image

    # Calculate new dimensions maintaining the aspect ratio
    ratio = max_size / max(width, height)
    new_width = int(width * ratio)
    new_height = int(height * ratio)

    # Resize the image using high-quality resampling
    return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
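# e.g. resize_image_max_dimension keeps 800x600 as-is and scales 4000x3000 down to 1024x768.

# On a ZeroGPU Space, @spaces.GPU requests a GPU only for the duration of each
# call (capped at 120 s below); off-Spaces the decorator should be a no-op.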
@spaces.GPU(duration=120)
def detect_and_draw(
    img: Image.Image,
    text_query: str,
    box_threshold: float = 0.4,
    text_threshold: float = 0.3
) -> Image.Image:
    """
    Detect objects described in `text_query`, draw boxes, and return the image.
    Note: `text_query` must be lowercase, with each concept ending in a dot
    (e.g. 'a cat. a remote control.').
    """
    # Make sure the text is lowercase
    text_query = text_query.lower()

    # If the image is too large, shrink it before inference
    img = resize_image_max_dimension(img, max_size=1024)

    # Preprocess the image and text prompt
    inputs = processor(images=img, text=text_query, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        outputs = model(**inputs)

    # Threshold the raw outputs and rescale boxes to pixel coordinates;
    # target_sizes expects (height, width)
    results = processor.post_process_grounded_object_detection(
        outputs,
        inputs.input_ids,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        target_sizes=[img.size[::-1]]
    )[0]

    # The label key differs across transformers versions, hence the fallback
    img_out = draw_boxes(
        img.copy(),
        boxes=results["boxes"].cpu().numpy(),
        labels=results.get("text_labels", results.get("labels", [])),
        scores=results["scores"]
    )
    return img_out
# Create the Gradio demo
demo = gr.Interface(
    fn=detect_and_draw,
    inputs=[
        gr.Image(type="pil", label="Image"),
        gr.Textbox(value="",
                   label="Text Query (lowercase, end each concept with '.', e.g. 'a bird. a tree.')"),
        gr.Slider(0.0, 1.0, value=0.4, step=0.05, label="Box Threshold"),
        gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Text Threshold")
    ],
    outputs=gr.Image(type="pil", label="Detections"),
    title="LLMDet Demo: Open-Vocabulary Grounded Object Detection",
    description="""Upload an image, enter a text query, and adjust the thresholds to see detections.

Adapted from the [Hugging Face demo](https://github.com/iSEE-Laboratory/LLMDet/tree/main/hf_model) in the LLMDet GitHub repo.

See the original resources:
* [LLMDet GitHub](https://github.com/iSEE-Laboratory/LLMDet/tree/main?tab=readme-ov-file)
* [LLMDet Paper](https://arxiv.org/abs/2501.18954) - LLMDet: Learning Strong Open-Vocabulary Object Detectors under the Supervision of Large Language Models
* [LLMDet model checkpoint](https://huggingface.co/fushh7/llmdet_swin_large_hf)
"""
)

demo.launch()
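For a quick sanity check outside the Gradio UI, something like the following should work (a minimal sketch; "sample.jpg" is a hypothetical local test image):

# Minimal smoke test (sketch): bypasses the UI and calls the inference
# function directly. "sample.jpg" is a hypothetical placeholder path.
test_img = Image.open("sample.jpg").convert("RGB")
annotated = detect_and_draw(test_img, "a cat. a remote control.")
annotated.save("detections.jpg")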