from typing import List, Tuple

import cv2
import numpy as np
import supervision as sv
import torch
from PIL import Image
from torchvision.ops import box_convert

import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util.misc import clean_state_dict
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import get_phrases_from_posmap
def preprocess_caption(caption: str) -> str:
    """Lowercase and strip the caption, ensuring it ends with a period as GroundingDINO text prompts expect."""
    result = caption.lower().strip()
    if result.endswith("."):
        return result
    return result + "."
def load_model(model_config_path: str, model_checkpoint_path: str, device: str = "cuda"):
    """Build a GroundingDINO model from a config file and load checkpoint weights."""
    args = SLConfig.fromfile(model_config_path)
    args.device = device
    model = build_model(args)
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    # clean_state_dict strips "module." prefixes left over from DataParallel training.
    model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    model.eval()
    return model
def load_image(image_path: str) -> Tuple[np.ndarray, torch.Tensor]:
    """Load an image as an RGB array plus a resized, normalized tensor for the model."""
    transform = T.Compose(
        [
            # With a single size, RandomResize deterministically scales the shorter side to 800.
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image_source = Image.open(image_path).convert("RGB")
    image = np.asarray(image_source)
    image_transformed, _ = transform(image_source, None)
    return image, image_transformed
def predict(
    model,
    image: torch.Tensor,
    caption: str,
    box_threshold: float,
    text_threshold: float,
    device: str = "cuda"
) -> Tuple[torch.Tensor, torch.Tensor, List[str]]:
    """Run the model on a single image and return boxes, confidence scores, and matched phrases."""
    caption = preprocess_caption(caption=caption)

    model = model.to(device)
    image = image.to(device)

    with torch.no_grad():
        outputs = model(image[None], captions=[caption])

    prediction_logits = outputs["pred_logits"].cpu().sigmoid()[0]  # prediction_logits.shape = (nq, 256)
    prediction_boxes = outputs["pred_boxes"].cpu()[0]  # prediction_boxes.shape = (nq, 4)

    # Keep only queries whose best token score exceeds the box threshold.
    mask = prediction_logits.max(dim=1)[0] > box_threshold
    logits = prediction_logits[mask]  # logits.shape = (n, 256)
    boxes = prediction_boxes[mask]  # boxes.shape = (n, 4)

    # Map each surviving query's token-level scores back to phrases in the caption.
    tokenizer = model.tokenizer
    tokenized = tokenizer(caption)
    phrases = [
        get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer).replace('.', '')
        for logit
        in logits
    ]

    return boxes, logits.max(dim=1)[0], phrases
def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor, phrases: List[str]) -> np.ndarray:
    """Draw labeled boxes on the image and return it as a BGR array (OpenCV convention)."""
    h, w, _ = image_source.shape
    # Boxes arrive as normalized cxcywh; scale to pixels and convert to xyxy corners.
    boxes = boxes * torch.Tensor([w, h, w, h])
    xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
    detections = sv.Detections(xyxy=xyxy)

    labels = [
        f"{phrase} {logit:.2f}"
        for phrase, logit
        in zip(phrases, logits)
    ]

    box_annotator = sv.BoxAnnotator()
    annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)
    annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
    return annotated_frame
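

# Minimal usage sketch (not part of the original module): wires the helpers
# above into an end-to-end run. The config, checkpoint, and image paths and
# the prompt below are placeholder assumptions; substitute your own files.
if __name__ == "__main__":
    CONFIG_PATH = "groundingdino/config/GroundingDINO_SwinT_OGC.py"  # assumed path
    CHECKPOINT_PATH = "weights/groundingdino_swint_ogc.pth"  # assumed path
    IMAGE_PATH = "dog.jpeg"  # assumed path

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = load_model(CONFIG_PATH, CHECKPOINT_PATH, device=device)
    image_source, image = load_image(IMAGE_PATH)

    boxes, logits, phrases = predict(
        model=model,
        image=image,
        caption="dog",
        box_threshold=0.35,
        text_threshold=0.25,
        device=device,
    )

    annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
    cv2.imwrite("annotated_image.jpg", annotated_frame)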