# Video Object Detection with YOLO Models


import os
os.environ['YOLO_CONFIG_DIR'] = '/tmp/Ultralytics'  # Set Ultralytics config path
import gradio as gr
import cv2
import numpy as np
from ultralytics import YOLO
import tempfile
from moviepy.editor import ImageSequenceClip
from PIL import Image

# Both YOLO models are created once, when this module is imported, and are
# shared by every call to process_video.
model_yolo11, model_best = YOLO('./data/yolo11n.pt'), YOLO('./data/best.pt')

def process_video(video_path, model_name, conf_threshold=0.4):
    """
    Run the selected YOLO model on every frame of a video and save the result.

    Each frame is read with OpenCV and passed to ``model.predict``; the
    annotated image returned by ``result.plot()`` is collected, and the
    collected frames are encoded to an MP4 file (``libx264``) with moviepy.

    Note that all annotated frames are held in memory until encoding, so
    memory use grows with the length of the video.

    Args:
        video_path: Path of the input video file.
        model_name: ``"YOLO11n"`` selects ``model_yolo11``; any other value
            selects ``model_best``.
        conf_threshold: Confidence threshold forwarded to ``model.predict``.

    Returns:
        Path of the annotated MP4 file, created in the system temp directory.

    Raises:
        ValueError: If no path is given, the video cannot be opened, or no
            frame could be read from it.
    """
    # The path may be None/empty, e.g. when no video was supplied by the caller.
    if not video_path:
        raise ValueError("No video file provided")

    # Select model to use
    model = model_yolo11 if model_name == "YOLO11n" else model_best

    cap = cv2.VideoCapture(video_path)
    processed_frames = []
    try:
        if not cap.isOpened():
            raise ValueError("Could not open video file")

        # Keep the frame rate as a float so rates such as 29.97 are not
        # truncated. `not fps > 0` also catches NaN, not only 0 or negatives.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            fps = 30.0  # fallback when the container reports no usable rate

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Perform detection
            results = model.predict(
                source=frame,
                conf=conf_threshold,
                imgsz=640,
                show_labels=True,
                show_conf=True
            )

            # Draw bounding boxes; plot() output is converted from BGR to RGB
            # because the frames are handed to moviepy rather than OpenCV.
            for result in results:
                annotated = result.plot()
                processed_frames.append(
                    cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)
                )
    finally:
        # Release the capture even if opening or prediction raised.
        cap.release()

    if not processed_frames:
        raise ValueError("No frames could be read from the video file")

    # Reserve a unique output name so overlapping calls do not overwrite each
    # other's result; the file is closed here and rewritten by moviepy below.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        temp_video_path = tmp.name

    clip = ImageSequenceClip(processed_frames, fps=fps)
    clip.write_videofile(temp_video_path, codec='libx264')

    return temp_video_path

# Gradio UI: one row with two columns. The first column holds the inputs
# (video, model choice, confidence threshold, button); the second column
# holds the processed video.
with gr.Blocks() as app:
    # Page heading.
    gr.HTML("""
        <h1 style='text-align: center'>
            Video Object Detection with YOLO Models
        </h1>
    """)

    with gr.Row():
        with gr.Column():
            video_input = gr.Video(label="Upload Video")
            # The "YOLO11n" choice is the string process_video compares
            # against; any other choice selects the second model.
            model_choice = gr.Dropdown(
                label="Select Model",
                choices=["YOLO11n", "Best Model"],
                value="YOLO11n",
            )
            conf_threshold = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.4,
                step=0.05,
                label="Confidence Threshold",
            )
            process_button = gr.Button("Process Video")

        with gr.Column():
            video_output = gr.Video(
                label="Processed Video", streaming=True, autoplay=True
            )

    # Clicking the button calls process_video with the three input values and
    # sends the returned file path to the output video component.
    process_button.click(
        process_video,
        inputs=[video_input, model_choice, conf_threshold],
        outputs=[video_output],
    )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()