Video Object Detection with YOLO Models

import gradio as gr
import cv2
import numpy as np
from ultralytics import YOLO
import os
import tempfile
from moviepy.editor import ImageSequenceClip
from PIL import Image

# Load both YOLO models
model_yolo11 = YOLO('./data/yolo11n.pt')
model_best = YOLO('./data/best.pt')

def process_video(video_path, model_name, conf_threshold=0.4):
    """
    Process the input video frame by frame using the selected YOLO model,
    draw bounding boxes, and return the processed video path.
    """
    # Select model based on user input
    model = model_yolo11 if model_name == "YOLO11n" else model_best

    # Open video capture
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError("Could not open video file")

    # Get video properties
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Store processed frames
    processed_frames = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Perform object detection
        results = model.predict(
            source=frame,
            conf=conf_threshold,
            imgsz=640,
            show_labels=True,
            show_conf=True
        )

        # Draw bounding boxes on the frame
        for result in results:
            im_array = result.plot()  # Plot bounding boxes
            processed_frames.append(im_array[..., ::-1])  # Convert BGR to RGB

    cap.release()

    # Save processed frames to a temporary video file
    temp_video_path = os.path.join(tempfile.gettempdir(), "output.mp4")
    clip = ImageSequenceClip(processed_frames, fps=fps)
    clip.write_videofile(temp_video_path, codec='libx264')

    return temp_video_path

# Define Gradio interface
with gr.Blocks() as app:
    gr.HTML("""
        <h1 style='text-align: center'>
            Video Object Detection with YOLO Models
        </h1>
    """)
    
    with gr.Row():
        with gr.Column():
            video_input = gr.Video(label="Upload Video")
            model_choice = gr.Dropdown(
                choices=["YOLO11n", "Best Model"],
                label="Select Model",
                value="YOLO11n"
            )
            conf_threshold = gr.Slider(
                label="Confidence Threshold",
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                value=0.4
            )
            process_button = gr.Button("Process Video")
        
        with gr.Column():
            video_output = gr.Video(
                label="Processed Video",
                streaming=True,
                autoplay=True
            )

    process_button.click(
        fn=process_video,
        inputs=[video_input, model_choice, conf_threshold],
        outputs=[video_output]
    )

if __name__ == "__main__":
    app.launch()