# -*- coding: utf-8 -*- """ YOLOv10 Single Object Feature Extractor This script extracts features for a specific detected object by its index. It can be used to build feature databases or for targeted object analysis. """ from ultralytics import YOLO from ultralytics.utils.ops import xywh2xyxy, scale_boxes from ultralytics.engine.results import Results import torch import time from torch.nn.functional import cosine_similarity import cv2 import matplotlib.pyplot as plt import numpy as np from pathlib import Path import urllib.request import argparse import json from torchvision.ops import RoIAlign as ROIAlign import torch.nn as nn import torch.nn.functional as F from types import MethodType import torchvision import collections # Monkey patch method to get feature maps def _predict_once(self, x, profile=False, visualize=False, embed=None): y, dt, embeddings = [], [], [] # outputs for m in self.model: if m.f != -1: # if not from previous layer x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f] # from earlier layers if profile: self._profile_one_layer(m, x, dt) x = m(x) # run y.append(x if m.i in self.save else None) # save output if visualize: feature_visualization(x, m.type, m.i, save_dir=visualize) if embed and m.i in embed: embeddings.append(x) if m.i == max(embed): return embeddings return x def get_yolov10_object_features_with_pooler(feat_list, idxs, boxes, orig_img_shape): """ Extracts object features from YOLOv10 feature maps using RoIAlign. Concatenates features from all levels for each detected object. """ # Assuming input image is resized to 640x640 img_size = 640 # We need to know the downsampling ratio for each feature map # P3 has stride 8, P4 has stride 16, P5 has stride 32 spatial_scales = [1.0 / 8, 1.0 / 16, 1.0 / 32] num_rois = len(boxes) if num_rois == 0: return [torch.empty(0)], [] # Add batch index 0 to boxes for ROIAlign zeros = torch.full((num_rois, 1), 0, device=boxes.device, dtype=boxes.dtype) rois = torch.cat((zeros, boxes), dim=1) poolers = [ ROIAlign(output_size=[7, 7], spatial_scale=ss, sampling_ratio=2) for ss in spatial_scales ] pooled_feats = [] for feat_map, pooler in zip(feat_list, poolers): pooled_feats.append(pooler(feat_map, rois)) avg_pool = nn.AdaptiveAvgPool2d((1, 1)) pooled_feats_flat = [avg_pool(pf).view(num_rois, -1) for pf in pooled_feats] # Concatenate features from all levels final_feats = torch.cat(pooled_feats_flat, dim=1) return [final_feats], pooled_feats def get_result_with_features_yolov10_simple(model, imgs, embed_layers, conf=0.25): """ Simplified approach: Use standard YOLO inference first, then extract features. """ if not isinstance(imgs, list): imgs = [imgs] # First, run standard inference to get proper Results objects results = model(imgs, verbose=False, conf=conf) # Then extract features for each detected object for i, result in enumerate(results): if hasattr(result, 'boxes') and len(result.boxes) > 0: # Get the preprocessed image that was used for inference prepped = model.predictor.preprocess([result.orig_img]) # --- Temporarily set the embed layers --- # Save the previous setting so we can restore it afterwards. Leaving a non-None # value in `model.predictor.args.embed` would cause the model to return raw # feature maps (instead of standard detection outputs) on the *next* call, # which results in missing detections for every image processed after the # first one. Restoring the value here ensures normal behaviour for the # following iterations. prev_embed = getattr(model.predictor.args, "embed", None) model.predictor.args.embed = embed_layers # Call inference with embedding to get feature maps features = model.predictor.inference(prepped) # Restore previous embed setting model.predictor.args.embed = prev_embed # The feature maps are all but the last element of the result feature_maps = features[:-1] # Extract features for each detected box boxes_scaled = result.boxes.xyxy # Scale boxes to the preprocessed image size for feature extraction boxes_for_features = scale_boxes(result.orig_img.shape, boxes_scaled.clone(), prepped.shape[2:]) # Create dummy indices (we're not using NMS indices here) dummy_idxs = [torch.arange(len(boxes_for_features))] # Get features obj_feats, pooled_feats = get_yolov10_object_features_with_pooler(feature_maps, dummy_idxs, boxes_for_features, result.orig_img.shape) # Add features to the result result.feats = obj_feats[0] if obj_feats else torch.empty(0) result.pooled_feats = pooled_feats return results def draw_debug_image(img, boxes, class_names, save_path="debug_detections.png", highlight_idx=None): """Draw bounding boxes on the original image for debugging.""" debug_img = img.copy() for i, box in enumerate(boxes): x1, y1, x2, y2 = box.cpu().numpy().astype(int) # Clip coordinates to image bounds x1, y1 = max(0, x1), max(0, y1) x2, y2 = min(img.shape[1], x2), min(img.shape[0], y2) # Highlight the selected object color = (0, 0, 255) if i == highlight_idx else (0, 255, 0) # Red for selected, green for others thickness = 3 if i == highlight_idx else 2 cv2.rectangle(debug_img, (x1, y1), (x2, y2), color, thickness) cv2.putText(debug_img, f"{class_names[i]} #{i}", (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2) cv2.imwrite(save_path, debug_img) print(f"Debug image with bounding boxes saved to {save_path}") return debug_img def draw_feature_heatmap(image, box, feature_map): """ Draws a feature map as a heatmap on a specific region of an image. """ # Detach and move feature map to CPU feature_map = feature_map.detach().cpu() # Average features across channels to get a 2D heatmap heatmap = torch.mean(feature_map, dim=0).numpy() # Normalize heatmap to 0-255 if np.max(heatmap) > np.min(heatmap): heatmap = (heatmap - np.min(heatmap)) / (np.max(heatmap) - np.min(heatmap)) heatmap = (heatmap * 255).astype(np.uint8) # Get bounding box coordinates x1, y1, x2, y2 = box.cpu().numpy().astype(int) x1, y1 = max(0, x1), max(0, y1) x2, y2 = min(image.shape[1], x2), min(image.shape[0], y2) bbox_w, bbox_h = x2 - x1, y2 - y1 if bbox_w <= 0 or bbox_h <= 0: return image # return original image # Resize heatmap to bounding box size heatmap_resized = cv2.resize(heatmap, (bbox_w, bbox_h), interpolation=cv2.INTER_LINEAR) # Apply colormap heatmap_colored = cv2.applyColorMap(heatmap_resized, cv2.COLORMAP_JET) # Get the region of interest from the original image roi = image[y1:y2, x1:x2] # Blend heatmap with ROI overlay = cv2.addWeighted(roi, 0.6, heatmap_colored, 0.4, 0) # Place the overlay back onto the image output_image = image.copy() output_image[y1:y2, x1:x2] = overlay return output_image def draw_filled_rounded_rectangle(img, pt1, pt2, color, radius): """Draws a filled rounded rectangle.""" x1, y1 = pt1 x2, y2 = pt2 # Draw circles at the corners cv2.circle(img, (x1 + radius, y1 + radius), radius, color, -1) cv2.circle(img, (x2 - radius, y1 + radius), radius, color, -1) cv2.circle(img, (x1 + radius, y2 - radius), radius, color, -1) cv2.circle(img, (x2 - radius, y2 - radius), radius, color, -1) # Draw the central rectangles cv2.rectangle(img, (x1 + radius, y1), (x2 - radius, y2), color, -1) cv2.rectangle(img, (x1, y1 + radius), (x2, y2 - radius), color, -1) def draw_modern_bbox(image, box, label, color): """Draws a modern-style bounding box with a semi-transparent, rounded label.""" x1, y1, x2, y2 = box.astype(int) # Draw the main bounding box outline cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness=2) # --- Label --- font = cv2.FONT_HERSHEY_SIMPLEX font_scale = 0.5 font_thickness = 1 (text_w, text_h), _ = cv2.getTextSize(label, font, font_scale, font_thickness) # Define label background position, handling top-of-image cases label_bg_pt1 = (x1, y1 - text_h - 15) label_bg_pt2 = (x1 + text_w + 10, y1) if label_bg_pt1[1] < 0: label_bg_pt1 = (x1, y1 + 5) label_bg_pt2 = (x1 + text_w + 10, y1 + text_h + 20) # Create an overlay for the semi-transparent background overlay = image.copy() # Draw the filled rounded rectangle on the overlay draw_filled_rounded_rectangle(overlay, label_bg_pt1, label_bg_pt2, color, radius=8) # Blend the overlay with the main image alpha = 0.6 cv2.addWeighted(overlay, alpha, image, 1 - alpha, 0, image) # Define text position and draw it on the blended image text_pt = (label_bg_pt1[0] + 5, label_bg_pt1[1] + text_h + 5) cv2.putText(image, label, text_pt, font, font_scale, (0, 0, 0), font_thickness, cv2.LINE_AA) def generate_feature_heatmaps(model, img_path, embed_layers, output_dir="./", conf=0.25): """ Generates a single composite image containing the main image with bounding boxes and separate heatmap snippets for each detected object. Args: model: YOLOv10 model img_path: Path to the input image embed_layers: List of layer indices to extract features from output_dir: Directory to save outputs conf: Object detection confidence threshold """ # Load image img = cv2.imread(img_path) if img is None: raise FileNotFoundError(f"Could not read image at {img_path}") print(f"Processing image: {img_path}") # Get results with features results_with_feat = get_result_with_features_yolov10_simple(model, img_path, embed_layers, conf=conf) if not results_with_feat or not isinstance(results_with_feat, list) or len(results_with_feat) == 0: print("No results returned.") return result = results_with_feat[0] if not hasattr(result, 'boxes') or len(result.boxes) == 0: print("No objects detected in the image.") return num_objects = len(result.boxes) print(f"Total objects detected: {num_objects}. Generating composite layout...") # Get class names all_class_names = [model.model.names[int(cls)] for cls in result.boxes.cls] # --- Step 1: Create the main image with modern bounding boxes --- main_image_with_boxes = img.copy() colors = [(71, 224, 253), (159, 128, 255), (159, 227, 128), (255, 191, 0), (255, 165, 0), (255, 0, 255)] for i in range(num_objects): label = f"{all_class_names[i]} {result.boxes.conf[i]:.2f}" color = colors[i % len(colors)] draw_modern_bbox(main_image_with_boxes, result.boxes.xyxy[i].cpu().numpy(), label, color) # --- Step 2: Generate individual heatmap snippets for each object --- heatmap_snippets = [] if hasattr(result, 'pooled_feats') and result.pooled_feats: last_layer_pooled_feats = result.pooled_feats[-1] for i in range(num_objects): box = result.boxes.xyxy[i] feature_map = last_layer_pooled_feats[i] heatmap_on_full = draw_feature_heatmap(img.copy(), box, feature_map) x1, y1, x2, y2 = box.cpu().numpy().astype(int) snippet = heatmap_on_full[y1:y2, x1:x2] label_text = f"Obj #{i}: {all_class_names[i]}" font = cv2.FONT_HERSHEY_SIMPLEX (text_w, text_h), _ = cv2.getTextSize(label_text, font, 0.6, 1) h, w, _ = snippet.shape # Make the snippet canvas wide enough for the text label new_w = max(w, text_w + 10) snippet_with_label = np.full((h + text_h + 15, new_w, 3), 255, dtype=np.uint8) # Paste the snippet (centered) onto the new canvas paste_x = (new_w - w) // 2 snippet_with_label[0:h, paste_x:paste_x+w] = snippet # Draw the label text (centered) text_x = (new_w - text_w) // 2 cv2.putText(snippet_with_label, label_text, (text_x, h + text_h + 5), font, 0.6, (0,0,0), 1, cv2.LINE_AA) cv2.rectangle(snippet_with_label, (0,0), (new_w-1, h+text_h+14), (180,180,180), 1) heatmap_snippets.append(snippet_with_label) if not heatmap_snippets: print("No heatmaps generated. Saving image with bounding boxes only.") image_name = Path(img_path).stem save_path = Path(output_dir) / f"{image_name}_layout.png" cv2.imwrite(str(save_path), main_image_with_boxes) return # --- Step 3: Arrange snippets and main image into a final composite image --- main_h, main_w, _ = main_image_with_boxes.shape padding = 20 # Arrange snippets into a horizontal row snippets_row_h = max(s.shape[0] for s in heatmap_snippets) total_snippets_w = sum(s.shape[1] for s in heatmap_snippets) + (len(heatmap_snippets) - 1) * 10 snippets_row = np.full((snippets_row_h, total_snippets_w, 3), 255, dtype=np.uint8) current_x = 0 for snippet in heatmap_snippets: h, w, _ = snippet.shape paste_y = (snippets_row_h - h) // 2 snippets_row[paste_y:paste_y+h, current_x:current_x+w] = snippet current_x += w + 10 # Create the final canvas and place the main image and the snippet row canvas_h = main_h + snippets_row_h + 3 * padding canvas_w = max(main_w, total_snippets_w) + 2 * padding final_image = np.full((canvas_h, canvas_w, 3), 255, dtype=np.uint8) # Paste main image at top-center x_offset_main = (canvas_w - main_w) // 2 final_image[padding:padding+main_h, x_offset_main:x_offset_main+main_w] = main_image_with_boxes # Paste snippet row at bottom-center x_offset_snippets = (canvas_w - total_snippets_w) // 2 y_offset_snippets = main_h + 2 * padding final_image[y_offset_snippets:y_offset_snippets+snippets_row_h, x_offset_snippets:x_offset_snippets+total_snippets_w] = snippets_row # --- Step 4: Save the final composite image --- image_name = Path(img_path).stem heatmap_path = Path(output_dir) / f"{image_name}_heatmap_layout.png" cv2.imwrite(str(heatmap_path), final_image) print(f" - Saved composite heatmap layout to: {heatmap_path}") def main(): parser = argparse.ArgumentParser(description='Generate a composite feature heatmap for all detected objects in an image or a directory of images.') group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--image', '-i', type=str, help='Path to a single input image.') group.add_argument('--input-dir', '-d', type=str, help='Path to a directory of input images.') parser.add_argument('--model', '-m', type=str, default='yolov10n.pt', help='Path to YOLOv10 model') parser.add_argument('--output', '-o', type=str, default='./heatmaps', help='Output directory for generated layouts.') parser.add_argument('--conf', type=float, default=0.25, help='Object detection confidence threshold (e.g., 0.1 for more detections).') args = parser.parse_args() # Create output directory if it doesn't exist Path(args.output).mkdir(parents=True, exist_ok=True) # Load YOLOv10 model print(f"Loading model: {args.model}") model = YOLO(args.model) # Monkey patch the model's prediction method model.model._predict_once = MethodType(_predict_once, model.model) # Initialize the predictor by running a dummy inference model(np.zeros((640, 640, 3)), verbose=False) # Dynamically find the feature map layer indices from the model detect_layer_index = -1 for i, m in enumerate(model.model.model): if 'Detect' in type(m).__name__: detect_layer_index = i break if detect_layer_index != -1: input_layers_indices = model.model.model[detect_layer_index].f embed_layers = sorted(input_layers_indices) + [detect_layer_index] print(f"Auto-detected feature layers at indices: {input_layers_indices}") print(f"Embedding features from layers: {embed_layers}") else: print("Could not find Detect layer, falling back to hardcoded indices") embed_layers = [16, 19, 22, 23] # Process either a single image or a directory of images if args.input_dir: input_path = Path(args.input_dir) image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.tif', '*.tiff'] image_files = [] for ext in image_extensions: image_files.extend(input_path.glob(ext)) if not image_files: print(f"No images found in '{args.input_dir}'.") return print(f"\nFound {len(image_files)} images in '{args.input_dir}'. Processing...") for img_path in image_files: generate_feature_heatmaps( model=model, img_path=str(img_path), embed_layers=embed_layers, output_dir=args.output, conf=args.conf ) else: # if args.image generate_feature_heatmaps( model=model, img_path=args.image, embed_layers=embed_layers, output_dir=args.output, conf=args.conf ) print(f"\nProcessing complete. All layouts saved to '{args.output}'.") if __name__ == "__main__": # If run without arguments, use test image import sys if len(sys.argv) == 1: print("No arguments provided. Running heatmap generation on a test image.") # Load YOLOv10 model print("Loading default model: yolov10n.pt") model = YOLO('yolov10n.pt') model.model._predict_once = MethodType(_predict_once, model.model) model(np.zeros((640, 640, 3)), verbose=False) # Auto-detect layers detect_layer_index = -1 for i, m in enumerate(model.model.model): if 'Detect' in type(m).__name__: detect_layer_index = i break if detect_layer_index != -1: input_layers_indices = model.model.model[detect_layer_index].f embed_layers = sorted(input_layers_indices) + [detect_layer_index] print(f"Auto-detected feature layers at indices: {input_layers_indices}") else: embed_layers = [16, 19, 22, 23] # Define test image path img_path = "/home/hew/yolov10FX_obj/id-1.jpg" # Generate heatmaps for the test image print("Using a lower confidence of 0.1 for test mode to find more objects.") generate_feature_heatmaps( model=model, img_path=img_path, embed_layers=embed_layers, output_dir="./", conf=0.1 ) print(f"\nHeatmap generation completed successfully for test image!") else: main()