# -*- coding: utf-8 -*-
"""
YOLOv10 Single Object Feature Extractor
This script extracts features for a specific detected object by its index.
It can be used to build feature databases or for targeted object analysis.
"""
import argparse
from pathlib import Path
from types import MethodType

import cv2
import numpy as np
import torch
import torch.nn as nn
from torchvision.ops import RoIAlign
from ultralytics import YOLO
from ultralytics.utils.plotting import feature_visualization

# Monkey-patched forward pass to get feature maps: unlike the stock ultralytics
# implementation, which average-pools and flattens the requested embeddings,
# this version returns the raw feature maps of the requested layers.
def _predict_once(self, x, profile=False, visualize=False, embed=None):
    y, dt, embeddings = [], [], []  # outputs
    for m in self.model:
        if m.f != -1:  # if not from previous layer
            x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
        if profile:
            self._profile_one_layer(m, x, dt)
        x = m(x)  # run
        y.append(x if m.i in self.save else None)  # save output
        if visualize:
            feature_visualization(x, m.type, m.i, save_dir=visualize)
        if embed and m.i in embed:
            embeddings.append(x)  # keep the raw feature map
            if m.i == max(embed):
                return embeddings
    return x

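# A minimal sketch of how the patch is used (the entry points below do the
# same): attach it to the underlying DetectionModel, then request embeddings by
# layer index. The indices [16, 19, 22] are assumed P3/P4/P5 positions for
# yolov10n; the real indices are auto-detected in main().
#
#   model = YOLO("yolov10n.pt")
#   model.model._predict_once = MethodType(_predict_once, model.model)
#   with torch.no_grad():
#       maps = model.model._predict_once(torch.zeros(1, 3, 640, 640), embed=[16, 19, 22])
#   # `maps` holds one raw feature map per requested layer, highest index last.
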
def get_yolov10_object_features_with_pooler(feat_list, boxes):
    """
    Extracts object features from YOLOv10 feature maps using RoIAlign.

    Pools a 7x7 window per box from each feature level, average-pools it to a
    single vector, and concatenates the vectors from all levels per object.

    Args:
        feat_list: Multi-scale feature maps (P3, P4, P5) with batch size 1.
        boxes: (N, 4) xyxy boxes in model-input (letterboxed) pixel coordinates.

    Returns:
        A one-element list holding the (N, C_total) concatenated features, and
        the list of per-level pooled (N, C, 7, 7) tensors.
    """
    # Downsampling ratio per feature map: P3 has stride 8, P4 stride 16, P5 stride 32
    spatial_scales = [1.0 / 8, 1.0 / 16, 1.0 / 32]
    num_rois = len(boxes)
    if num_rois == 0:
        return [torch.empty(0)], []
    # RoIAlign expects rois as (N, 5): a batch index followed by x1, y1, x2, y2.
    # All boxes belong to the single image in the batch, so the index is 0.
    zeros = torch.zeros((num_rois, 1), device=boxes.device, dtype=boxes.dtype)
    rois = torch.cat((zeros, boxes), dim=1)
    poolers = [
        RoIAlign(output_size=(7, 7), spatial_scale=ss, sampling_ratio=2) for ss in spatial_scales
    ]
    pooled_feats = []
    for feat_map, pooler in zip(feat_list, poolers):
        # Match the roi dtype to the feature map in case the model runs in half precision
        pooled_feats.append(pooler(feat_map, rois.to(feat_map.dtype)))
    avg_pool = nn.AdaptiveAvgPool2d((1, 1))
    pooled_feats_flat = [avg_pool(pf).view(num_rois, -1) for pf in pooled_feats]
    # Concatenate features from all levels
    final_feats = torch.cat(pooled_feats_flat, dim=1)
    return [final_feats], pooled_feats

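# A minimal usage sketch with hypothetical shapes: three FPN-style maps for a
# single 640x640 input and two boxes in model-input pixel coordinates. The
# channel counts are illustrative, not the real yolov10n widths.
#
#   feat_list = [torch.randn(1, 64, 80, 80),    # P3, stride 8
#                torch.randn(1, 128, 40, 40),   # P4, stride 16
#                torch.randn(1, 256, 20, 20)]   # P5, stride 32
#   boxes = torch.tensor([[50., 60., 200., 220.],
#                         [300., 310., 400., 420.]])
#   (feats,), pooled = get_yolov10_object_features_with_pooler(feat_list, boxes)
#   feats.shape      # torch.Size([2, 448]) -> 64 + 128 + 256 channels per object
#   pooled[0].shape  # torch.Size([2, 64, 7, 7])
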
def get_result_with_features_yolov10_simple(model, imgs, embed_layers, conf=0.25):
    """
    Simplified approach: run standard YOLO inference first, then extract
    per-object features from the feature maps in a second forward pass.
    """
    if not isinstance(imgs, list):
        imgs = [imgs]
    # First, run standard inference to get proper Results objects
    results = model(imgs, verbose=False, conf=conf)
    # Then extract features for each detected object
    for result in results:
        if hasattr(result, 'boxes') and len(result.boxes) > 0:
            # Re-create the preprocessed (letterboxed) tensor used for inference
            prepped = model.predictor.preprocess([result.orig_img])
            # --- Temporarily set the embed layers ---
            # Save the previous setting so it can be restored afterwards. Leaving a
            # non-None value in `model.predictor.args.embed` would make the model
            # return raw feature maps (instead of standard detection outputs) on the
            # *next* call, so every image processed after the first one would lose
            # its detections. Restoring the value keeps later iterations working.
            prev_embed = getattr(model.predictor.args, "embed", None)
            model.predictor.args.embed = embed_layers
            # Call inference with embedding enabled to get the feature maps
            features = model.predictor.inference(prepped)
            # Restore the previous embed setting
            model.predictor.args.embed = prev_embed
            # All but the last element are feature maps; the last is the Detect output
            feature_maps = features[:-1]
            # Map boxes from original-image coordinates into the letterboxed
            # model-input coordinates the feature maps were computed on. This
            # assumes the predictor's default center letterbox: scale by the
            # smaller ratio, then pad equally on both sides.
            in_h, in_w = prepped.shape[2:]
            orig_h, orig_w = result.orig_img.shape[:2]
            gain = min(in_h / orig_h, in_w / orig_w)
            pad_w = (in_w - orig_w * gain) / 2
            pad_h = (in_h - orig_h * gain) / 2
            boxes_for_features = result.boxes.xyxy.clone()
            boxes_for_features[:, [0, 2]] = boxes_for_features[:, [0, 2]] * gain + pad_w
            boxes_for_features[:, [1, 3]] = boxes_for_features[:, [1, 3]] * gain + pad_h
            # Pool per-object features from every level
            obj_feats, pooled_feats = get_yolov10_object_features_with_pooler(feature_maps, boxes_for_features)
            # Attach the features to the result
            result.feats = obj_feats[0] if obj_feats else torch.empty(0)
            result.pooled_feats = pooled_feats
    return results

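# A usage sketch, assuming the _predict_once patch has been applied and the
# predictor initialized by a prior call (as done in main() below). "image.jpg"
# is a hypothetical path; `embed_layers` comes from the auto-detection in main().
#
#   results = get_result_with_features_yolov10_simple(model, "image.jpg", embed_layers)
#   results[0].feats.shape        # (num_detections, concatenated channels)
#   len(results[0].pooled_feats)  # one (N, C, 7, 7) tensor per feature level
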
def draw_debug_image(img, boxes, class_names, save_path="debug_detections.png", highlight_idx=None):
    """Draw bounding boxes on the original image for debugging."""
    debug_img = img.copy()
    for i, box in enumerate(boxes):
        x1, y1, x2, y2 = box.cpu().numpy().astype(int)
        # Clip coordinates to image bounds
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(img.shape[1], x2), min(img.shape[0], y2)
        # Highlight the selected object: red for selected, green for others
        color = (0, 0, 255) if i == highlight_idx else (0, 255, 0)
        thickness = 3 if i == highlight_idx else 2
        cv2.rectangle(debug_img, (x1, y1), (x2, y2), color, thickness)
        cv2.putText(debug_img, f"{class_names[i]} #{i}", (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
    cv2.imwrite(save_path, debug_img)
    print(f"Debug image with bounding boxes saved to {save_path}")
    return debug_img

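# A usage sketch (hypothetical names): boxes as an (N, 4) xyxy tensor in
# original-image coordinates, one class name per box, and a BGR image as
# returned by cv2.imread.
#
#   draw_debug_image(img, result.boxes.xyxy, all_class_names, highlight_idx=0)
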
def draw_feature_heatmap(image, box, feature_map):
    """
    Draws a feature map as a heatmap over a specific region of an image.
    """
    # Detach and move the feature map to the CPU
    feature_map = feature_map.detach().cpu().float()
    # Average across channels to get a 2D heatmap
    heatmap = torch.mean(feature_map, dim=0).numpy()
    # Normalize the heatmap to 0-255
    if np.max(heatmap) > np.min(heatmap):
        heatmap = (heatmap - np.min(heatmap)) / (np.max(heatmap) - np.min(heatmap))
    else:
        heatmap = np.zeros_like(heatmap)  # constant map: nothing to visualize
    heatmap = (heatmap * 255).astype(np.uint8)
    # Get bounding box coordinates, clipped to image bounds
    x1, y1, x2, y2 = box.cpu().numpy().astype(int)
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(image.shape[1], x2), min(image.shape[0], y2)
    bbox_w, bbox_h = x2 - x1, y2 - y1
    if bbox_w <= 0 or bbox_h <= 0:
        return image  # degenerate box: return the original image
    # Resize the heatmap to the bounding box size
    heatmap_resized = cv2.resize(heatmap, (bbox_w, bbox_h), interpolation=cv2.INTER_LINEAR)
    # Apply a colormap
    heatmap_colored = cv2.applyColorMap(heatmap_resized, cv2.COLORMAP_JET)
    # Blend the heatmap with the region of interest
    roi = image[y1:y2, x1:x2]
    overlay = cv2.addWeighted(roi, 0.6, heatmap_colored, 0.4, 0)
    # Place the overlay back onto the image
    output_image = image.copy()
    output_image[y1:y2, x1:x2] = overlay
    return output_image

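# A minimal sketch: overlay one object's pooled P5 feature map (shape (C, 7, 7))
# onto its box. Both names below come from get_result_with_features_yolov10_simple.
#
#   overlaid = draw_feature_heatmap(img.copy(), result.boxes.xyxy[0], result.pooled_feats[-1][0])
#   cv2.imwrite("heatmap_obj0.png", overlaid)
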
def draw_filled_rounded_rectangle(img, pt1, pt2, color, radius):
    """Draws a filled rounded rectangle."""
    x1, y1 = pt1
    x2, y2 = pt2
    # Draw circles at the corners
    cv2.circle(img, (x1 + radius, y1 + radius), radius, color, -1)
    cv2.circle(img, (x2 - radius, y1 + radius), radius, color, -1)
    cv2.circle(img, (x1 + radius, y2 - radius), radius, color, -1)
    cv2.circle(img, (x2 - radius, y2 - radius), radius, color, -1)
    # Draw the central rectangles
    cv2.rectangle(img, (x1 + radius, y1), (x2 - radius, y2), color, -1)
    cv2.rectangle(img, (x1, y1 + radius), (x2, y2 - radius), color, -1)

def draw_modern_bbox(image, box, label, color):
    """Draws a modern-style bounding box with a semi-transparent, rounded label."""
    x1, y1, x2, y2 = box.astype(int)
    # Draw the main bounding box outline
    cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness=2)
    # --- Label ---
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    font_thickness = 1
    (text_w, text_h), _ = cv2.getTextSize(label, font, font_scale, font_thickness)
    # Define the label background position, handling boxes at the top of the image
    label_bg_pt1 = (x1, y1 - text_h - 15)
    label_bg_pt2 = (x1 + text_w + 10, y1)
    if label_bg_pt1[1] < 0:
        label_bg_pt1 = (x1, y1 + 5)
        label_bg_pt2 = (x1 + text_w + 10, y1 + text_h + 20)
    # Create an overlay for the semi-transparent background
    overlay = image.copy()
    # Draw the filled rounded rectangle on the overlay
    draw_filled_rounded_rectangle(overlay, label_bg_pt1, label_bg_pt2, color, radius=8)
    # Blend the overlay with the main image
    alpha = 0.6
    cv2.addWeighted(overlay, alpha, image, 1 - alpha, 0, image)
    # Position and draw the label text on the blended image
    text_pt = (label_bg_pt1[0] + 5, label_bg_pt1[1] + text_h + 5)
    cv2.putText(image, label, text_pt, font, font_scale, (0, 0, 0), font_thickness, cv2.LINE_AA)

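# A usage sketch: box as a NumPy xyxy array, color in BGR; the image is
# modified in place.
#
#   draw_modern_bbox(img, np.array([50, 60, 200, 220]), "person 0.91", (71, 224, 253))
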
def generate_feature_heatmaps(model, img_path, embed_layers, output_dir="./", conf=0.25):
    """
    Generates a single composite image containing the main image with bounding
    boxes and a separate heatmap snippet for each detected object.

    Args:
        model: YOLOv10 model
        img_path: Path to the input image
        embed_layers: List of layer indices to extract features from
        output_dir: Directory to save outputs
        conf: Object detection confidence threshold
    """
    # Load the image
    img = cv2.imread(img_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image at {img_path}")
    print(f"Processing image: {img_path}")
    # Get results with features
    results_with_feat = get_result_with_features_yolov10_simple(model, img_path, embed_layers, conf=conf)
    if not results_with_feat:
        print("No results returned.")
        return
    result = results_with_feat[0]
    if not hasattr(result, 'boxes') or len(result.boxes) == 0:
        print("No objects detected in the image.")
        return
    num_objects = len(result.boxes)
    print(f"Total objects detected: {num_objects}. Generating composite layout...")
    # Get class names
    all_class_names = [model.model.names[int(cls)] for cls in result.boxes.cls]
    # --- Step 1: Create the main image with modern bounding boxes ---
    main_image_with_boxes = img.copy()
    colors = [(71, 224, 253), (159, 128, 255), (159, 227, 128), (255, 191, 0), (255, 165, 0), (255, 0, 255)]
    for i in range(num_objects):
        label = f"{all_class_names[i]} {result.boxes.conf[i]:.2f}"
        color = colors[i % len(colors)]
        draw_modern_bbox(main_image_with_boxes, result.boxes.xyxy[i].cpu().numpy(), label, color)
    # --- Step 2: Generate an individual heatmap snippet for each object ---
    heatmap_snippets = []
    if hasattr(result, 'pooled_feats') and result.pooled_feats:
        # Use the pooled features from the last (coarsest) level, P5
        last_layer_pooled_feats = result.pooled_feats[-1]
        for i in range(num_objects):
            box = result.boxes.xyxy[i]
            feature_map = last_layer_pooled_feats[i]
            heatmap_on_full = draw_feature_heatmap(img.copy(), box, feature_map)
            # Crop the snippet, clipping the box to the image bounds
            x1, y1, x2, y2 = box.cpu().numpy().astype(int)
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(img.shape[1], x2), min(img.shape[0], y2)
            snippet = heatmap_on_full[y1:y2, x1:x2]
            if snippet.size == 0:
                continue  # skip degenerate boxes
            label_text = f"Obj #{i}: {all_class_names[i]}"
            font = cv2.FONT_HERSHEY_SIMPLEX
            (text_w, text_h), _ = cv2.getTextSize(label_text, font, 0.6, 1)
            h, w, _ = snippet.shape
            # Make the snippet canvas wide enough for the text label
            new_w = max(w, text_w + 10)
            snippet_with_label = np.full((h + text_h + 15, new_w, 3), 255, dtype=np.uint8)
            # Paste the snippet (centered) onto the new canvas
            paste_x = (new_w - w) // 2
            snippet_with_label[0:h, paste_x:paste_x + w] = snippet
            # Draw the label text (centered)
            text_x = (new_w - text_w) // 2
            cv2.putText(snippet_with_label, label_text, (text_x, h + text_h + 5), font, 0.6, (0, 0, 0), 1, cv2.LINE_AA)
            cv2.rectangle(snippet_with_label, (0, 0), (new_w - 1, h + text_h + 14), (180, 180, 180), 1)
            heatmap_snippets.append(snippet_with_label)
    if not heatmap_snippets:
        print("No heatmaps generated. Saving image with bounding boxes only.")
        image_name = Path(img_path).stem
        save_path = Path(output_dir) / f"{image_name}_layout.png"
        cv2.imwrite(str(save_path), main_image_with_boxes)
        return
    # --- Step 3: Arrange the snippets and the main image into a composite ---
    main_h, main_w, _ = main_image_with_boxes.shape
    padding = 20
    # Arrange the snippets into a horizontal row
    snippets_row_h = max(s.shape[0] for s in heatmap_snippets)
    total_snippets_w = sum(s.shape[1] for s in heatmap_snippets) + (len(heatmap_snippets) - 1) * 10
    snippets_row = np.full((snippets_row_h, total_snippets_w, 3), 255, dtype=np.uint8)
    current_x = 0
    for snippet in heatmap_snippets:
        h, w, _ = snippet.shape
        paste_y = (snippets_row_h - h) // 2
        snippets_row[paste_y:paste_y + h, current_x:current_x + w] = snippet
        current_x += w + 10
    # Create the final canvas and place the main image and the snippet row
    canvas_h = main_h + snippets_row_h + 3 * padding
    canvas_w = max(main_w, total_snippets_w) + 2 * padding
    final_image = np.full((canvas_h, canvas_w, 3), 255, dtype=np.uint8)
    # Paste the main image at top-center
    x_offset_main = (canvas_w - main_w) // 2
    final_image[padding:padding + main_h, x_offset_main:x_offset_main + main_w] = main_image_with_boxes
    # Paste the snippet row at bottom-center
    x_offset_snippets = (canvas_w - total_snippets_w) // 2
    y_offset_snippets = main_h + 2 * padding
    final_image[y_offset_snippets:y_offset_snippets + snippets_row_h, x_offset_snippets:x_offset_snippets + total_snippets_w] = snippets_row
    # --- Step 4: Save the final composite image ---
    image_name = Path(img_path).stem
    heatmap_path = Path(output_dir) / f"{image_name}_heatmap_layout.png"
    cv2.imwrite(str(heatmap_path), final_image)
    print(f" - Saved composite heatmap layout to: {heatmap_path}")

def main():
    parser = argparse.ArgumentParser(description='Generate a composite feature heatmap for all detected objects in an image or a directory of images.')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--image', '-i', type=str, help='Path to a single input image.')
    group.add_argument('--input-dir', '-d', type=str, help='Path to a directory of input images.')
    parser.add_argument('--model', '-m', type=str, default='yolov10n.pt', help='Path to YOLOv10 model')
    parser.add_argument('--output', '-o', type=str, default='./heatmaps', help='Output directory for generated layouts.')
    parser.add_argument('--conf', type=float, default=0.25, help='Object detection confidence threshold (e.g., 0.1 for more detections).')
    args = parser.parse_args()
    # Create the output directory if it doesn't exist
    Path(args.output).mkdir(parents=True, exist_ok=True)
    # Load the YOLOv10 model
    print(f"Loading model: {args.model}")
    model = YOLO(args.model)
    # Monkey patch the model's prediction method
    model.model._predict_once = MethodType(_predict_once, model.model)
    # Initialize the predictor by running a dummy inference
    model(np.zeros((640, 640, 3), dtype=np.uint8), verbose=False)
    # Dynamically find the feature map layer indices from the model
    detect_layer_index = -1
    for i, m in enumerate(model.model.model):
        if 'Detect' in type(m).__name__:
            detect_layer_index = i
            break
    if detect_layer_index != -1:
        # The Detect head's `f` attribute lists the layers feeding it (P3, P4, P5)
        input_layers_indices = model.model.model[detect_layer_index].f
        embed_layers = sorted(input_layers_indices) + [detect_layer_index]
        print(f"Auto-detected feature layers at indices: {input_layers_indices}")
        print(f"Embedding features from layers: {embed_layers}")
    else:
        print("Could not find Detect layer, falling back to hardcoded indices")
        embed_layers = [16, 19, 22, 23]
    # Process either a single image or a directory of images
    if args.input_dir:
        input_path = Path(args.input_dir)
        image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.tif', '*.tiff']
        image_files = []
        for ext in image_extensions:
            image_files.extend(input_path.glob(ext))
        if not image_files:
            print(f"No images found in '{args.input_dir}'.")
            return
        print(f"\nFound {len(image_files)} images in '{args.input_dir}'. Processing...")
        for img_path in image_files:
            generate_feature_heatmaps(
                model=model,
                img_path=str(img_path),
                embed_layers=embed_layers,
                output_dir=args.output,
                conf=args.conf
            )
    else:  # args.image
        generate_feature_heatmaps(
            model=model,
            img_path=args.image,
            embed_layers=embed_layers,
            output_dir=args.output,
            conf=args.conf
        )
    print(f"\nProcessing complete. All layouts saved to '{args.output}'.")

if __name__ == "__main__":
    # If run without arguments, fall back to a test image
    import sys
    if len(sys.argv) == 1:
        print("No arguments provided. Running heatmap generation on a test image.")
        # Load the default YOLOv10 model
        print("Loading default model: yolov10n.pt")
        model = YOLO('yolov10n.pt')
        model.model._predict_once = MethodType(_predict_once, model.model)
        model(np.zeros((640, 640, 3), dtype=np.uint8), verbose=False)
        # Auto-detect the feature layers
        detect_layer_index = -1
        for i, m in enumerate(model.model.model):
            if 'Detect' in type(m).__name__:
                detect_layer_index = i
                break
        if detect_layer_index != -1:
            input_layers_indices = model.model.model[detect_layer_index].f
            embed_layers = sorted(input_layers_indices) + [detect_layer_index]
            print(f"Auto-detected feature layers at indices: {input_layers_indices}")
        else:
            embed_layers = [16, 19, 22, 23]
        # Define the test image path (local to the original author's machine)
        img_path = "/home/hew/yolov10FX_obj/id-1.jpg"
        # Generate heatmaps for the test image
        print("Using a lower confidence of 0.1 for test mode to find more objects.")
        generate_feature_heatmaps(
            model=model,
            img_path=img_path,
            embed_layers=embed_layers,
            output_dir="./",
            conf=0.1
        )
        print("\nHeatmap generation completed successfully for test image!")
    else:
        main()