# -*- coding: utf-8 -*-
"""
YOLOv10 Single Object Feature Extractor
This script extracts features for a specific detected object by its index.
It can be used to build feature databases or for targeted object analysis.
"""
import argparse
from pathlib import Path
from types import MethodType

import cv2
import numpy as np
import torch
import torch.nn as nn
from torchvision.ops import RoIAlign
from ultralytics import YOLO
from ultralytics.utils.plotting import feature_visualization

# Monkey-patched forward pass to get feature maps: unlike the stock ultralytics
# implementation, which average-pools and flattens the requested embeddings,
# this version returns the raw feature maps of the requested layers.
def _predict_once(self, x, profile=False, visualize=False, embed=None):
    y, dt, embeddings = [], [], []  # outputs
    for m in self.model:
        if m.f != -1:  # if not from previous layer
            x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
        if profile:
            self._profile_one_layer(m, x, dt)
        x = m(x)  # run
        y.append(x if m.i in self.save else None)  # save output
        if visualize:
            feature_visualization(x, m.type, m.i, save_dir=visualize)
        if embed and m.i in embed:
            embeddings.append(x)  # keep the raw feature map
            if m.i == max(embed):
                return embeddings
    return x

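# A minimal sketch of how the patch is used (the entry points below do the
# same): attach it to the underlying DetectionModel, then request embeddings by
# layer index. The indices [16, 19, 22] are assumed P3/P4/P5 positions for
# yolov10n; the real indices are auto-detected in main().
#
#   model = YOLO("yolov10n.pt")
#   model.model._predict_once = MethodType(_predict_once, model.model)
#   with torch.no_grad():
#       maps = model.model._predict_once(torch.zeros(1, 3, 640, 640), embed=[16, 19, 22])
#   # `maps` holds one raw feature map per requested layer, highest index last.
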
def get_yolov10_object_features_with_pooler(feat_list, boxes):
    """
    Extracts object features from YOLOv10 feature maps using RoIAlign.

    Pools a 7x7 window per box from each feature level, average-pools it to a
    single vector, and concatenates the vectors from all levels per object.

    Args:
        feat_list: Multi-scale feature maps (P3, P4, P5) with batch size 1.
        boxes: (N, 4) xyxy boxes in model-input (letterboxed) pixel coordinates.

    Returns:
        A one-element list holding the (N, C_total) concatenated features, and
        the list of per-level pooled (N, C, 7, 7) tensors.
    """
    # Downsampling ratio per feature map: P3 has stride 8, P4 stride 16, P5 stride 32
    spatial_scales = [1.0 / 8, 1.0 / 16, 1.0 / 32]
    num_rois = len(boxes)
    if num_rois == 0:
        return [torch.empty(0)], []
    # RoIAlign expects rois as (N, 5): a batch index followed by x1, y1, x2, y2.
    # All boxes belong to the single image in the batch, so the index is 0.
    zeros = torch.zeros((num_rois, 1), device=boxes.device, dtype=boxes.dtype)
    rois = torch.cat((zeros, boxes), dim=1)
    poolers = [
        RoIAlign(output_size=(7, 7), spatial_scale=ss, sampling_ratio=2) for ss in spatial_scales
    ]
    pooled_feats = []
    for feat_map, pooler in zip(feat_list, poolers):
        # Match the roi dtype to the feature map in case the model runs in half precision
        pooled_feats.append(pooler(feat_map, rois.to(feat_map.dtype)))
    avg_pool = nn.AdaptiveAvgPool2d((1, 1))
    pooled_feats_flat = [avg_pool(pf).view(num_rois, -1) for pf in pooled_feats]
    # Concatenate features from all levels
    final_feats = torch.cat(pooled_feats_flat, dim=1)
    return [final_feats], pooled_feats

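# A minimal usage sketch with hypothetical shapes: three FPN-style maps for a
# single 640x640 input and two boxes in model-input pixel coordinates. The
# channel counts are illustrative, not the real yolov10n widths.
#
#   feat_list = [torch.randn(1, 64, 80, 80),    # P3, stride 8
#                torch.randn(1, 128, 40, 40),   # P4, stride 16
#                torch.randn(1, 256, 20, 20)]   # P5, stride 32
#   boxes = torch.tensor([[50., 60., 200., 220.],
#                         [300., 310., 400., 420.]])
#   (feats,), pooled = get_yolov10_object_features_with_pooler(feat_list, boxes)
#   feats.shape      # torch.Size([2, 448]) -> 64 + 128 + 256 channels per object
#   pooled[0].shape  # torch.Size([2, 64, 7, 7])
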
def get_result_with_features_yolov10_simple(model, imgs, embed_layers, conf=0.25):
    """
    Simplified approach: run standard YOLO inference first, then extract
    per-object features from the feature maps in a second forward pass.
    """
    if not isinstance(imgs, list):
        imgs = [imgs]
    # First, run standard inference to get proper Results objects
    results = model(imgs, verbose=False, conf=conf)
    # Then extract features for each detected object
    for result in results:
        if hasattr(result, 'boxes') and len(result.boxes) > 0:
            # Re-create the preprocessed (letterboxed) tensor used for inference
            prepped = model.predictor.preprocess([result.orig_img])
            # --- Temporarily set the embed layers ---
            # Save the previous setting so it can be restored afterwards. Leaving a
            # non-None value in `model.predictor.args.embed` would make the model
            # return raw feature maps (instead of standard detection outputs) on the
            # *next* call, so every image processed after the first one would lose
            # its detections. Restoring the value keeps later iterations working.
            prev_embed = getattr(model.predictor.args, "embed", None)
            model.predictor.args.embed = embed_layers
            # Call inference with embedding enabled to get the feature maps
            features = model.predictor.inference(prepped)
            # Restore the previous embed setting
            model.predictor.args.embed = prev_embed
            # All but the last element are feature maps; the last is the Detect output
            feature_maps = features[:-1]
            # Map boxes from original-image coordinates into the letterboxed
            # model-input coordinates the feature maps were computed on. This
            # assumes the predictor's default center letterbox: scale by the
            # smaller ratio, then pad equally on both sides.
            in_h, in_w = prepped.shape[2:]
            orig_h, orig_w = result.orig_img.shape[:2]
            gain = min(in_h / orig_h, in_w / orig_w)
            pad_w = (in_w - orig_w * gain) / 2
            pad_h = (in_h - orig_h * gain) / 2
            boxes_for_features = result.boxes.xyxy.clone()
            boxes_for_features[:, [0, 2]] = boxes_for_features[:, [0, 2]] * gain + pad_w
            boxes_for_features[:, [1, 3]] = boxes_for_features[:, [1, 3]] * gain + pad_h
            # Pool per-object features from every level
            obj_feats, pooled_feats = get_yolov10_object_features_with_pooler(feature_maps, boxes_for_features)
            # Attach the features to the result
            result.feats = obj_feats[0] if obj_feats else torch.empty(0)
            result.pooled_feats = pooled_feats
    return results

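# A usage sketch, assuming the _predict_once patch has been applied and the
# predictor initialized by a prior call (as done in main() below). "image.jpg"
# is a hypothetical path; `embed_layers` comes from the auto-detection in main().
#
#   results = get_result_with_features_yolov10_simple(model, "image.jpg", embed_layers)
#   results[0].feats.shape        # (num_detections, concatenated channels)
#   len(results[0].pooled_feats)  # one (N, C, 7, 7) tensor per feature level
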
def draw_debug_image(img, boxes, class_names, save_path="debug_detections.png", highlight_idx=None):
    """Draw bounding boxes on the original image for debugging."""
    debug_img = img.copy()
    for i, box in enumerate(boxes):
        x1, y1, x2, y2 = box.cpu().numpy().astype(int)
        # Clip coordinates to image bounds
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(img.shape[1], x2), min(img.shape[0], y2)
        # Highlight the selected object: red for selected, green for others
        color = (0, 0, 255) if i == highlight_idx else (0, 255, 0)
        thickness = 3 if i == highlight_idx else 2
        cv2.rectangle(debug_img, (x1, y1), (x2, y2), color, thickness)
        cv2.putText(debug_img, f"{class_names[i]} #{i}", (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
    cv2.imwrite(save_path, debug_img)
    print(f"Debug image with bounding boxes saved to {save_path}")
    return debug_img

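# A usage sketch (hypothetical names): boxes as an (N, 4) xyxy tensor in
# original-image coordinates, one class name per box, and a BGR image as
# returned by cv2.imread.
#
#   draw_debug_image(img, result.boxes.xyxy, all_class_names, highlight_idx=0)
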
def draw_feature_heatmap(image, box, feature_map):
    """
    Draws a feature map as a heatmap over a specific region of an image.
    """
    # Detach and move the feature map to the CPU
    feature_map = feature_map.detach().cpu().float()
    # Average across channels to get a 2D heatmap
    heatmap = torch.mean(feature_map, dim=0).numpy()
    # Normalize the heatmap to 0-255
    if np.max(heatmap) > np.min(heatmap):
        heatmap = (heatmap - np.min(heatmap)) / (np.max(heatmap) - np.min(heatmap))
    else:
        heatmap = np.zeros_like(heatmap)  # constant map: nothing to visualize
    heatmap = (heatmap * 255).astype(np.uint8)
    # Get bounding box coordinates, clipped to image bounds
    x1, y1, x2, y2 = box.cpu().numpy().astype(int)
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(image.shape[1], x2), min(image.shape[0], y2)
    bbox_w, bbox_h = x2 - x1, y2 - y1
    if bbox_w <= 0 or bbox_h <= 0:
        return image  # degenerate box: return the original image
    # Resize the heatmap to the bounding box size
    heatmap_resized = cv2.resize(heatmap, (bbox_w, bbox_h), interpolation=cv2.INTER_LINEAR)
    # Apply a colormap
    heatmap_colored = cv2.applyColorMap(heatmap_resized, cv2.COLORMAP_JET)
    # Blend the heatmap with the region of interest
    roi = image[y1:y2, x1:x2]
    overlay = cv2.addWeighted(roi, 0.6, heatmap_colored, 0.4, 0)
    # Place the overlay back onto the image
    output_image = image.copy()
    output_image[y1:y2, x1:x2] = overlay
    return output_image

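# A minimal sketch: overlay one object's pooled P5 feature map (shape (C, 7, 7))
# onto its box. Both names below come from get_result_with_features_yolov10_simple.
#
#   overlaid = draw_feature_heatmap(img.copy(), result.boxes.xyxy[0], result.pooled_feats[-1][0])
#   cv2.imwrite("heatmap_obj0.png", overlaid)
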
def draw_filled_rounded_rectangle(img, pt1, pt2, color, radius):
    """Draws a filled rounded rectangle."""
    x1, y1 = pt1
    x2, y2 = pt2
    # Draw circles at the corners
    cv2.circle(img, (x1 + radius, y1 + radius), radius, color, -1)
    cv2.circle(img, (x2 - radius, y1 + radius), radius, color, -1)
    cv2.circle(img, (x1 + radius, y2 - radius), radius, color, -1)
    cv2.circle(img, (x2 - radius, y2 - radius), radius, color, -1)
    # Draw the central rectangles
    cv2.rectangle(img, (x1 + radius, y1), (x2 - radius, y2), color, -1)
    cv2.rectangle(img, (x1, y1 + radius), (x2, y2 - radius), color, -1)

def draw_modern_bbox(image, box, label, color):
    """Draws a modern-style bounding box with a semi-transparent, rounded label."""
    x1, y1, x2, y2 = box.astype(int)
    # Draw the main bounding box outline
    cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness=2)
    # --- Label ---
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    font_thickness = 1
    (text_w, text_h), _ = cv2.getTextSize(label, font, font_scale, font_thickness)
    # Define the label background position, handling boxes at the top of the image
    label_bg_pt1 = (x1, y1 - text_h - 15)
    label_bg_pt2 = (x1 + text_w + 10, y1)
    if label_bg_pt1[1] < 0:
        label_bg_pt1 = (x1, y1 + 5)
        label_bg_pt2 = (x1 + text_w + 10, y1 + text_h + 20)
    # Create an overlay for the semi-transparent background
    overlay = image.copy()
    # Draw the filled rounded rectangle on the overlay
    draw_filled_rounded_rectangle(overlay, label_bg_pt1, label_bg_pt2, color, radius=8)
    # Blend the overlay with the main image
    alpha = 0.6
    cv2.addWeighted(overlay, alpha, image, 1 - alpha, 0, image)
    # Position and draw the label text on the blended image
    text_pt = (label_bg_pt1[0] + 5, label_bg_pt1[1] + text_h + 5)
    cv2.putText(image, label, text_pt, font, font_scale, (0, 0, 0), font_thickness, cv2.LINE_AA)

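# A usage sketch: box as a NumPy xyxy array, color in BGR; the image is
# modified in place.
#
#   draw_modern_bbox(img, np.array([50, 60, 200, 220]), "person 0.91", (71, 224, 253))
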
def generate_feature_heatmaps(model, img_path, embed_layers, output_dir="./", conf=0.25):
    """
    Generates a single composite image containing the main image with bounding
    boxes and a separate heatmap snippet for each detected object.

    Args:
        model: YOLOv10 model
        img_path: Path to the input image
        embed_layers: List of layer indices to extract features from
        output_dir: Directory to save outputs
        conf: Object detection confidence threshold
    """
    # Load the image
    img = cv2.imread(img_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image at {img_path}")
    print(f"Processing image: {img_path}")
    # Get results with features
    results_with_feat = get_result_with_features_yolov10_simple(model, img_path, embed_layers, conf=conf)
    if not results_with_feat:
        print("No results returned.")
        return
    result = results_with_feat[0]
    if not hasattr(result, 'boxes') or len(result.boxes) == 0:
        print("No objects detected in the image.")
        return
    num_objects = len(result.boxes)
    print(f"Total objects detected: {num_objects}. Generating composite layout...")
    # Get class names
    all_class_names = [model.model.names[int(cls)] for cls in result.boxes.cls]
    # --- Step 1: Create the main image with modern bounding boxes ---
    main_image_with_boxes = img.copy()
    colors = [(71, 224, 253), (159, 128, 255), (159, 227, 128), (255, 191, 0), (255, 165, 0), (255, 0, 255)]
    for i in range(num_objects):
        label = f"{all_class_names[i]} {result.boxes.conf[i]:.2f}"
        color = colors[i % len(colors)]
        draw_modern_bbox(main_image_with_boxes, result.boxes.xyxy[i].cpu().numpy(), label, color)
    # --- Step 2: Generate an individual heatmap snippet for each object ---
    heatmap_snippets = []
    if hasattr(result, 'pooled_feats') and result.pooled_feats:
        # Use the pooled features from the last (coarsest) level, P5
        last_layer_pooled_feats = result.pooled_feats[-1]
        for i in range(num_objects):
            box = result.boxes.xyxy[i]
            feature_map = last_layer_pooled_feats[i]
            heatmap_on_full = draw_feature_heatmap(img.copy(), box, feature_map)
            # Crop the snippet, clipping the box to the image bounds
            x1, y1, x2, y2 = box.cpu().numpy().astype(int)
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(img.shape[1], x2), min(img.shape[0], y2)
            snippet = heatmap_on_full[y1:y2, x1:x2]
            if snippet.size == 0:
                continue  # skip degenerate boxes
            label_text = f"Obj #{i}: {all_class_names[i]}"
            font = cv2.FONT_HERSHEY_SIMPLEX
            (text_w, text_h), _ = cv2.getTextSize(label_text, font, 0.6, 1)
            h, w, _ = snippet.shape
            # Make the snippet canvas wide enough for the text label
            new_w = max(w, text_w + 10)
            snippet_with_label = np.full((h + text_h + 15, new_w, 3), 255, dtype=np.uint8)
            # Paste the snippet (centered) onto the new canvas
            paste_x = (new_w - w) // 2
            snippet_with_label[0:h, paste_x:paste_x + w] = snippet
            # Draw the label text (centered)
            text_x = (new_w - text_w) // 2
            cv2.putText(snippet_with_label, label_text, (text_x, h + text_h + 5), font, 0.6, (0, 0, 0), 1, cv2.LINE_AA)
            cv2.rectangle(snippet_with_label, (0, 0), (new_w - 1, h + text_h + 14), (180, 180, 180), 1)
            heatmap_snippets.append(snippet_with_label)
    if not heatmap_snippets:
        print("No heatmaps generated. Saving image with bounding boxes only.")
        image_name = Path(img_path).stem
        save_path = Path(output_dir) / f"{image_name}_layout.png"
        cv2.imwrite(str(save_path), main_image_with_boxes)
        return
    # --- Step 3: Arrange the snippets and the main image into a composite ---
    main_h, main_w, _ = main_image_with_boxes.shape
    padding = 20
    # Arrange the snippets into a horizontal row
    snippets_row_h = max(s.shape[0] for s in heatmap_snippets)
    total_snippets_w = sum(s.shape[1] for s in heatmap_snippets) + (len(heatmap_snippets) - 1) * 10
    snippets_row = np.full((snippets_row_h, total_snippets_w, 3), 255, dtype=np.uint8)
    current_x = 0
    for snippet in heatmap_snippets:
        h, w, _ = snippet.shape
        paste_y = (snippets_row_h - h) // 2
        snippets_row[paste_y:paste_y + h, current_x:current_x + w] = snippet
        current_x += w + 10
    # Create the final canvas and place the main image and the snippet row
    canvas_h = main_h + snippets_row_h + 3 * padding
    canvas_w = max(main_w, total_snippets_w) + 2 * padding
    final_image = np.full((canvas_h, canvas_w, 3), 255, dtype=np.uint8)
    # Paste the main image at top-center
    x_offset_main = (canvas_w - main_w) // 2
    final_image[padding:padding + main_h, x_offset_main:x_offset_main + main_w] = main_image_with_boxes
    # Paste the snippet row at bottom-center
    x_offset_snippets = (canvas_w - total_snippets_w) // 2
    y_offset_snippets = main_h + 2 * padding
    final_image[y_offset_snippets:y_offset_snippets + snippets_row_h, x_offset_snippets:x_offset_snippets + total_snippets_w] = snippets_row
    # --- Step 4: Save the final composite image ---
    image_name = Path(img_path).stem
    heatmap_path = Path(output_dir) / f"{image_name}_heatmap_layout.png"
    cv2.imwrite(str(heatmap_path), final_image)
    print(f" - Saved composite heatmap layout to: {heatmap_path}")

def main():
    parser = argparse.ArgumentParser(description='Generate a composite feature heatmap for all detected objects in an image or a directory of images.')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--image', '-i', type=str, help='Path to a single input image.')
    group.add_argument('--input-dir', '-d', type=str, help='Path to a directory of input images.')
    parser.add_argument('--model', '-m', type=str, default='yolov10n.pt', help='Path to YOLOv10 model')
    parser.add_argument('--output', '-o', type=str, default='./heatmaps', help='Output directory for generated layouts.')
    parser.add_argument('--conf', type=float, default=0.25, help='Object detection confidence threshold (e.g., 0.1 for more detections).')
    args = parser.parse_args()
    # Create the output directory if it doesn't exist
    Path(args.output).mkdir(parents=True, exist_ok=True)
    # Load the YOLOv10 model
    print(f"Loading model: {args.model}")
    model = YOLO(args.model)
    # Monkey patch the model's prediction method
    model.model._predict_once = MethodType(_predict_once, model.model)
    # Initialize the predictor by running a dummy inference
    model(np.zeros((640, 640, 3), dtype=np.uint8), verbose=False)
    # Dynamically find the feature map layer indices from the model
    detect_layer_index = -1
    for i, m in enumerate(model.model.model):
        if 'Detect' in type(m).__name__:
            detect_layer_index = i
            break
    if detect_layer_index != -1:
        # The Detect head's `f` attribute lists the layers feeding it (P3, P4, P5)
        input_layers_indices = model.model.model[detect_layer_index].f
        embed_layers = sorted(input_layers_indices) + [detect_layer_index]
        print(f"Auto-detected feature layers at indices: {input_layers_indices}")
        print(f"Embedding features from layers: {embed_layers}")
    else:
        print("Could not find Detect layer, falling back to hardcoded indices")
        embed_layers = [16, 19, 22, 23]
    # Process either a single image or a directory of images
    if args.input_dir:
        input_path = Path(args.input_dir)
        image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.tif', '*.tiff']
        image_files = []
        for ext in image_extensions:
            image_files.extend(input_path.glob(ext))
        if not image_files:
            print(f"No images found in '{args.input_dir}'.")
            return
        print(f"\nFound {len(image_files)} images in '{args.input_dir}'. Processing...")
        for img_path in image_files:
            generate_feature_heatmaps(
                model=model,
                img_path=str(img_path),
                embed_layers=embed_layers,
                output_dir=args.output,
                conf=args.conf
            )
    else:  # args.image
        generate_feature_heatmaps(
            model=model,
            img_path=args.image,
            embed_layers=embed_layers,
            output_dir=args.output,
            conf=args.conf
        )
    print(f"\nProcessing complete. All layouts saved to '{args.output}'.")

if __name__ == "__main__":
    # If run without arguments, fall back to a test image
    import sys
    if len(sys.argv) == 1:
        print("No arguments provided. Running heatmap generation on a test image.")
        # Load the default YOLOv10 model
        print("Loading default model: yolov10n.pt")
        model = YOLO('yolov10n.pt')
        model.model._predict_once = MethodType(_predict_once, model.model)
        model(np.zeros((640, 640, 3), dtype=np.uint8), verbose=False)
        # Auto-detect the feature layers
        detect_layer_index = -1
        for i, m in enumerate(model.model.model):
            if 'Detect' in type(m).__name__:
                detect_layer_index = i
                break
        if detect_layer_index != -1:
            input_layers_indices = model.model.model[detect_layer_index].f
            embed_layers = sorted(input_layers_indices) + [detect_layer_index]
            print(f"Auto-detected feature layers at indices: {input_layers_indices}")
        else:
            embed_layers = [16, 19, 22, 23]
        # Define the test image path (local to the original author's machine)
        img_path = "/home/hew/yolov10FX_obj/id-1.jpg"
        # Generate heatmaps for the test image
        print("Using a lower confidence of 0.1 for test mode to find more objects.")
        generate_feature_heatmaps(
            model=model,
            img_path=img_path,
            embed_layers=embed_layers,
            output_dir="./",
            conf=0.1
        )
        print("\nHeatmap generation completed successfully for test image!")
    else:
        main()