"""Gradio demo that runs YOLOv5x object detection on uploaded images.

The script loads the YOLOv5x weights (cached under ``models/``), exposes
``detect_objects`` as the inference entry point, and launches a Gradio
Blocks UI around it.
"""

import os
import time
from pathlib import Path
from typing import Optional

import cv2
import gradio as gr
import numpy as np
import torch

# Create cache directory for models if it doesn't exist
os.makedirs("models", exist_ok=True)

# Check device availability - Hugging Face Spaces often provides GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Location of the cached YOLOv5x weights
model_path = Path("models/yolov5x.pt")


def _load_model(weights_path: Path, target_device: torch.device):
    """Load YOLOv5x from ``weights_path`` and move it to ``target_device``.

    The hub ``custom`` entry point is given the weights path directly, so
    the same call is used whether or not the cache file already exists.
    NOTE(review): this relies on YOLOv5 downloading official release weights
    to a missing ``path`` — confirm against the pinned YOLOv5 version.

    Args:
        weights_path: Path of the ``.pt`` checkpoint to load (or create).
        target_device: Device the loaded model is moved to.

    Returns:
        The model object returned by ``torch.hub.load``.

    Raises:
        Exception: Whatever ``torch.hub.load`` raises when loading fails and
            no retry is possible (or the retry fails too).
    """
    had_cache = weights_path.exists()
    if had_cache:
        print(f"Loading model from cache: {weights_path}")
    else:
        print("Downloading YOLOv5x model and caching...")

    def _hub_load():
        return torch.hub.load(
            "ultralytics/yolov5", "custom", path=str(weights_path)
        )

    try:
        loaded = _hub_load()
    except Exception:
        # Broad on purpose: the exception type raised by the hub loader is
        # not known here. Only retry when a pre-existing cache file could be
        # the culprit (e.g. a bare state_dict written by an older version of
        # this script); otherwise propagate the original error.
        if not had_cache:
            raise
        print(f"Cached weights at {weights_path} are unusable; re-downloading...")
        weights_path.unlink()
        loaded = _hub_load()

    return loaded.to(target_device)


model = _load_model(model_path, device)

# Optimization configurations
model.conf = 0.3  # Confidence threshold of 0.3 as specified
model.iou = 0.3  # NMS IoU threshold of 0.3 as specified
model.classes = None  # No class filter: keep detections of every class

# Optimize for GPU if available
if device.type == "cuda":
    # Use half precision for a performance boost
    model.half()
else:
    # On CPU, use every available core; cpu_count() may return None
    torch.set_num_threads(os.cpu_count() or 1)

# Set model to evaluation mode for inference
model.eval()

# Assign fixed colors to each class for consistent visualization
np.random.seed(42)  # For reproducible colors
colors = np.random.uniform(0, 255, size=(len(model.names), 3))

# Track performance metrics across calls
total_inference_time = 0.0
inference_count = 0

_FONT = cv2.FONT_HERSHEY_SIMPLEX
_LABEL_FONT_SCALE = 0.8
_LABEL_FONT_THICKNESS = 2


def _draw_detection(canvas, box, label, color):
    """Draw one bounding box and its text label onto ``canvas`` in place.

    Args:
        canvas: Image array that is drawn on.
        box: ``(x1, y1, x2, y2)`` integer corner coordinates.
        label: Text rendered next to the box.
        color: Per-class color used for the box and label background.
    """
    x1, y1, x2, y2 = box
    cv2.rectangle(canvas, (x1, y1), (x2, y2), color, 4)

    (text_w, text_h), _ = cv2.getTextSize(
        label, _FONT, _LABEL_FONT_SCALE, _LABEL_FONT_THICKNESS
    )

    # Default placement is just above the box. When the box touches the top
    # edge that would fall off the image, so put the label inside the box.
    label_top = y1 - text_h - 10
    if label_top < 0:
        label_top = y1
    label_bottom = label_top + text_h + 10

    cv2.rectangle(
        canvas, (x1, label_top), (x1 + text_w + 10, label_bottom), color, -1
    )

    text_origin = (x1 + 5, label_bottom - 5)
    # Black outline first, then white text on top, for readability
    cv2.putText(
        canvas, label, text_origin, _FONT, _LABEL_FONT_SCALE,
        (0, 0, 0), _LABEL_FONT_THICKNESS + 1,
    )
    cv2.putText(
        canvas, label, text_origin, _FONT, _LABEL_FONT_SCALE,
        (255, 255, 255), _LABEL_FONT_THICKNESS,
    )


def _draw_fps_panel(canvas, fps, avg_fps):
    """Return ``canvas`` with a semi-transparent FPS panel in the top-left.

    Args:
        canvas: Image array to annotate.
        fps: Frames per second of the current call.
        avg_fps: Frames per second averaged over all calls so far.

    Returns:
        A new image array containing the blended panel and the FPS text.
    """
    overlay = canvas.copy()
    cv2.rectangle(overlay, (5, 5), (250, 80), (0, 0, 0), -1)

    # Blend the dark panel over the image with transparency
    alpha = 0.7
    blended = cv2.addWeighted(overlay, alpha, canvas, 1 - alpha, 0)

    cv2.putText(blended, f"FPS: {fps:.2f}", (10, 35), _FONT, 1, (0, 255, 0), 2)
    cv2.putText(
        blended, f"Avg FPS: {avg_fps:.2f}", (10, 70), _FONT, 1, (0, 255, 0), 2
    )
    return blended


def detect_objects(image: Optional[np.ndarray]) -> Optional[np.ndarray]:
    """Run YOLOv5 on ``image`` and return a copy with detections drawn.

    Updates the module-level ``total_inference_time`` and
    ``inference_count`` counters that feed the running average FPS.

    Args:
        image: Input image as a numpy array, or ``None`` when no image was
            provided.

    Returns:
        A new image with bounding boxes, labels and an FPS panel drawn on
        it, or ``None`` if ``image`` is ``None``.
    """
    global total_inference_time, inference_count

    if image is None:
        return None

    # Draw on a copy so the caller's array is left untouched
    output_image = image.copy()

    # Fixed input size for optimal processing
    input_size = 640

    # Time only the model call, using a monotonic high-resolution clock
    start_time = time.perf_counter()
    with torch.no_grad():
        results = model(image, size=input_size)
    if device.type == "cuda":
        # CUDA kernels run asynchronously; wait so the timing is meaningful
        torch.cuda.synchronize()
    # Clamp so the FPS divisions below can never divide by zero
    inference_time = max(time.perf_counter() - start_time, 1e-9)

    total_inference_time += inference_time
    inference_count += 1
    avg_inference_time = total_inference_time / inference_count

    # Extract detections from first (and only) image
    detections = results.pred[0].cpu().numpy()

    for *xyxy, conf, cls in detections:
        class_id = int(cls)
        box = tuple(int(v) for v in xyxy)
        label = f"{model.names[class_id]} {conf:.2f}"
        _draw_detection(output_image, box, label, colors[class_id].tolist())

    return _draw_fps_panel(
        output_image, 1 / inference_time, 1 / avg_inference_time
    )


# Define example images - these are stored relative to the working directory
example_images = [
    "examples/spring_street_after.jpg",
    "examples/pexels-hikaique-109919.jpg",
]

# Make sure example directory exists
os.makedirs("examples", exist_ok=True)

# Only offer examples that are actually present; running the example gallery
# over missing files would fail when the app starts.
available_examples = [path for path in example_images if Path(path).is_file()]

_DESCRIPTION_MD = """
# Optimized YOLOv5 Object Detection

This system utilizes YOLOv5 to detect 80+ object types from the COCO dataset.

**Performance Features:**
- Processing speed: Optimized for 30+ FPS at 640x640 resolution
- Confidence threshold: 0.3
- IoU threshold: 0.3

Simply upload an image and click Submit to see the detections!
"""

# Create Gradio interface - optimized for Hugging Face Spaces
with gr.Blocks(title="Optimized YOLOv5 Object Detection") as demo:
    gr.Markdown(_DESCRIPTION_MD)

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(label="Input Image", type="numpy")
            with gr.Row():
                submit_button = gr.Button("Submit", variant="primary")
                clear_button = gr.Button("Clear")

        with gr.Column(scale=1):
            output_image = gr.Image(label="Detected Objects", type="numpy")

    # Example gallery (skipped when no example file is available)
    if available_examples:
        gr.Examples(
            examples=available_examples,
            inputs=input_image,
            outputs=output_image,
            fn=detect_objects,
            cache_examples=True,  # Cache for faster response
        )

    # Set up button event handlers
    submit_button.click(
        fn=detect_objects, inputs=input_image, outputs=output_image
    )
    # One return value per output component: clear both image panes
    clear_button.click(lambda: (None, None), None, [input_image, output_image])

# Launch for Hugging Face Spaces
demo.launch()