freddyaboulton (HF Staff) committed
Commit 9bd282f · verified · 1 Parent(s): 88eaa72

Upload folder using huggingface_hub

Files changed (6):
  1. README.md +8 -8
  2. inference.py +148 -0
  3. requirements.txt +8 -0
  4. run.ipynb +1 -0
  5. run.py +72 -0
  6. utils.py +237 -0
README.md CHANGED
@@ -1,12 +1,12 @@
+
 ---
-title: Yolov10 Webcam Stream Main
-emoji: 👁
-colorFrom: pink
-colorTo: blue
+title: yolov10_webcam_stream_main
+emoji: 🔥
+colorFrom: indigo
+colorTo: indigo
 sdk: gradio
-sdk_version: 4.44.1
-app_file: app.py
+sdk_version: 5.0.0
+app_file: run.py
 pinned: false
+hf_oauth: true
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
inference.py ADDED
@@ -0,0 +1,148 @@
+import time
+import cv2
+import numpy as np
+import onnxruntime  # type: ignore
+
+from utils import draw_detections  # type: ignore
+
+
+class YOLOv10:
+    def __init__(self, path):
+        # Initialize model
+        self.initialize_model(path)
+
+    def __call__(self, image):
+        return self.detect_objects(image)
+
+    def initialize_model(self, path):
+        self.session = onnxruntime.InferenceSession(
+            path, providers=onnxruntime.get_available_providers()
+        )
+        # Get model info
+        self.get_input_details()
+        self.get_output_details()
+
+    def detect_objects(self, image, conf_threshold=0.3):
+        input_tensor = self.prepare_input(image)
+
+        # Perform inference on the image
+        new_image = self.inference(image, input_tensor, conf_threshold)
+
+        return new_image
+
+    def prepare_input(self, image):
+        self.img_height, self.img_width = image.shape[:2]
+
+        input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+        # Resize input image
+        input_img = cv2.resize(input_img, (self.input_width, self.input_height))
+
+        # Scale input pixel values to 0 to 1
+        input_img = input_img / 255.0
+        input_img = input_img.transpose(2, 0, 1)
+        input_tensor = input_img[np.newaxis, :, :, :].astype(np.float32)
+
+        return input_tensor
+
+    def inference(self, image, input_tensor, conf_threshold=0.3):
+        start = time.perf_counter()
+        outputs = self.session.run(
+            self.output_names, {self.input_names[0]: input_tensor}
+        )
+
+        print(f"Inference time: {(time.perf_counter() - start) * 1000:.2f} ms")
+        (
+            boxes,
+            scores,
+            class_ids,
+        ) = self.process_output(outputs, conf_threshold)
+        return self.draw_detections(image, boxes, scores, class_ids)
+
+    def process_output(self, output, conf_threshold=0.3):
+        predictions = np.squeeze(output[0])
+
+        # Filter out object confidence scores below threshold
+        scores = predictions[:, 4]
+        predictions = predictions[scores > conf_threshold, :]
+        scores = scores[scores > conf_threshold]
+
+        if len(scores) == 0:
+            return [], [], []
+
+        # Get the class with the highest confidence
+        class_ids = predictions[:, 5].astype(int)
+
+        # Get bounding boxes for each object
+        boxes = self.extract_boxes(predictions)
+
+        return boxes, scores, class_ids
+
+    def extract_boxes(self, predictions):
+        # Extract boxes from predictions
+        boxes = predictions[:, :4]
+
+        # Scale boxes to original image dimensions
+        boxes = self.rescale_boxes(boxes)
+
+        # Boxes are already in xyxy format, so no xywh -> xyxy conversion is needed
+        # boxes = xywh2xyxy(boxes)
+
+        return boxes
+
+    def rescale_boxes(self, boxes):
+        # Rescale boxes to original image dimensions
+        input_shape = np.array(
+            [self.input_width, self.input_height, self.input_width, self.input_height]
+        )
+        boxes = np.divide(boxes, input_shape, dtype=np.float32)
+        boxes *= np.array(
+            [self.img_width, self.img_height, self.img_width, self.img_height]
+        )
+        return boxes
+
+    def draw_detections(
+        self, image, boxes, scores, class_ids, draw_scores=True, mask_alpha=0.4
+    ):
+        return draw_detections(image, boxes, scores, class_ids, mask_alpha)
+
+    def get_input_details(self):
+        model_inputs = self.session.get_inputs()
+        self.input_names = [model_inputs[i].name for i in range(len(model_inputs))]
+
+        self.input_shape = model_inputs[0].shape
+        self.input_height = self.input_shape[2]
+        self.input_width = self.input_shape[3]
+
+    def get_output_details(self):
+        model_outputs = self.session.get_outputs()
+        self.output_names = [model_outputs[i].name for i in range(len(model_outputs))]
+
+
+if __name__ == "__main__":
+    import requests
+    import tempfile
+    from huggingface_hub import hf_hub_download
+
+    model_file = hf_hub_download(
+        repo_id="onnx-community/yolov10s", filename="onnx/model.onnx"
+    )
+
+    yolov10_detector = YOLOv10(model_file)
+
+    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as f:
+        f.write(
+            requests.get(
+                "https://live.staticflickr.com/13/19041780_d6fd803de0_3k.jpg"
+            ).content
+        )
+        f.seek(0)
+        img = cv2.imread(f.name)
+
+    # Detect objects
+    combined_image = yolov10_detector.detect_objects(img)
+
+    # Draw detections
+    cv2.namedWindow("Output", cv2.WINDOW_NORMAL)
+    cv2.imshow("Output", combined_image)
+    cv2.waitKey(0)
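
The `__main__` block above needs a display server for `cv2.imshow`. A minimal headless sketch of using the same class (the file paths here are placeholders, not part of the commit):

```python
import cv2
from inference import YOLOv10

# "model.onnx" is a placeholder path: any YOLOv10 ONNX export whose output
# rows follow the (x1, y1, x2, y2, score, class_id) layout assumed above.
detector = YOLOv10("model.onnx")

frame = cv2.imread("frame.jpg")  # placeholder BGR test image, any resolution
annotated = detector.detect_objects(frame, conf_threshold=0.5)
cv2.imwrite("annotated.jpg", annotated)  # write to disk instead of cv2.imshow
```
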
requirements.txt ADDED
@@ -0,0 +1,8 @@
+gradio-client @ git+https://github.com/gradio-app/gradio@bbf9ba7e997022960c621f72baa891185bd03732#subdirectory=client/python
+https://gradio-pypi-previews.s3.amazonaws.com/bbf9ba7e997022960c621f72baa891185bd03732/gradio-5.0.0-py3-none-any.whl
+safetensors==0.4.3
+opencv-python
+twilio
+gradio>=5.0,<6.0
+gradio-webrtc==0.0.1
+onnxruntime-gpu
run.ipynb ADDED
@@ -0,0 +1 @@
 
 
+ {"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: yolov10_webcam_stream"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio safetensors==0.4.3 opencv-python twilio gradio>=5.0,<6.0 gradio-webrtc==0.0.1 onnxruntime-gpu"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/yolov10_webcam_stream/inference.py\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/yolov10_webcam_stream/utils.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import cv2\n", "from huggingface_hub import hf_hub_download\n", "from gradio_webrtc import WebRTC # type: ignore\n", "from twilio.rest import Client # type: ignore\n", "import os\n", "from inference import YOLOv10 # type: ignore\n", "\n", "model_file = hf_hub_download(\n", " repo_id=\"onnx-community/yolov10n\", filename=\"onnx/model.onnx\"\n", ")\n", "\n", "model = YOLOv10(model_file)\n", "\n", "account_sid = os.environ.get(\"TWILIO_ACCOUNT_SID\")\n", "auth_token = os.environ.get(\"TWILIO_AUTH_TOKEN\")\n", "\n", "if account_sid and auth_token:\n", " client = Client(account_sid, auth_token)\n", "\n", " token = client.tokens.create()\n", "\n", " rtc_configuration = {\n", " \"iceServers\": token.ice_servers,\n", " \"iceTransportPolicy\": \"relay\",\n", " }\n", "else:\n", " rtc_configuration = None\n", "\n", "\n", "def detection(image, conf_threshold=0.3):\n", " image = cv2.resize(image, (model.input_width, model.input_height))\n", " new_image = model.detect_objects(image, conf_threshold)\n", " return cv2.resize(new_image, (500, 500))\n", "\n", "\n", "css = \"\"\".my-group {max-width: 600px !important; max-height: 600 !important;}\n", " .my-column {display: flex !important; justify-content: center !important; align-items: center !important};\"\"\"\n", "\n", "\n", "with gr.Blocks(css=css) as demo:\n", " gr.HTML(\n", " \"\"\"\n", " <h1 style='text-align: center'>\n", " YOLOv10 Webcam Stream (Powered by WebRTC \u26a1\ufe0f)\n", " </h1>\n", " \"\"\"\n", " )\n", " gr.HTML(\n", " \"\"\"\n", " <h3 style='text-align: center'>\n", " <a href='https://arxiv.org/abs/2405.14458' target='_blank'>arXiv</a> | <a href='https://github.com/THU-MIG/yolov10' target='_blank'>github</a>\n", " </h3>\n", " \"\"\"\n", " )\n", " with gr.Column(elem_classes=[\"my-column\"]):\n", " with gr.Group(elem_classes=[\"my-group\"]):\n", " image = WebRTC(label=\"Stream\", rtc_configuration=rtc_configuration)\n", " conf_threshold = gr.Slider(\n", " label=\"Confidence Threshold\",\n", " minimum=0.0,\n", " maximum=1.0,\n", " step=0.05,\n", " value=0.30,\n", " )\n", "\n", " image.stream(\n", " fn=detection, inputs=[image, conf_threshold], outputs=[image], time_limit=10\n", " )\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
run.py ADDED
@@ -0,0 +1,72 @@
+import gradio as gr
+import cv2
+from huggingface_hub import hf_hub_download
+from gradio_webrtc import WebRTC  # type: ignore
+from twilio.rest import Client  # type: ignore
+import os
+from inference import YOLOv10  # type: ignore
+
+model_file = hf_hub_download(
+    repo_id="onnx-community/yolov10n", filename="onnx/model.onnx"
+)
+
+model = YOLOv10(model_file)
+
+account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
+auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
+
+if account_sid and auth_token:
+    client = Client(account_sid, auth_token)
+
+    token = client.tokens.create()
+
+    rtc_configuration = {
+        "iceServers": token.ice_servers,
+        "iceTransportPolicy": "relay",
+    }
+else:
+    rtc_configuration = None
+
+
+def detection(image, conf_threshold=0.3):
+    image = cv2.resize(image, (model.input_width, model.input_height))
+    new_image = model.detect_objects(image, conf_threshold)
+    return cv2.resize(new_image, (500, 500))
+
+
+css = """.my-group {max-width: 600px !important; max-height: 600px !important;}
+.my-column {display: flex !important; justify-content: center !important; align-items: center !important;}"""
+
+
+with gr.Blocks(css=css) as demo:
+    gr.HTML(
+        """
+        <h1 style='text-align: center'>
+        YOLOv10 Webcam Stream (Powered by WebRTC ⚡️)
+        </h1>
+        """
+    )
+    gr.HTML(
+        """
+        <h3 style='text-align: center'>
+        <a href='https://arxiv.org/abs/2405.14458' target='_blank'>arXiv</a> | <a href='https://github.com/THU-MIG/yolov10' target='_blank'>github</a>
+        </h3>
+        """
+    )
+    with gr.Column(elem_classes=["my-column"]):
+        with gr.Group(elem_classes=["my-group"]):
+            image = WebRTC(label="Stream", rtc_configuration=rtc_configuration)
+            conf_threshold = gr.Slider(
+                label="Confidence Threshold",
+                minimum=0.0,
+                maximum=1.0,
+                step=0.05,
+                value=0.30,
+            )
+
+    image.stream(
+        fn=detection, inputs=[image, conf_threshold], outputs=[image], time_limit=10
+    )
+
+if __name__ == "__main__":
+    demo.launch()
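
To sanity-check the `detection` callback without a webcam, Twilio credentials, or the WebRTC UI, a sketch like the following should work (note that importing `run` downloads the model and builds the Blocks app at import time; `test.jpg` is a placeholder):

```python
import cv2
from run import detection  # triggers hf_hub_download and model init at import time

frame = cv2.imread("test.jpg")  # placeholder BGR frame standing in for a webcam capture
out = detection(frame, conf_threshold=0.3)
print(out.shape)  # (500, 500, 3): the callback resizes its output to 500x500
```
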
utils.py ADDED
@@ -0,0 +1,237 @@
+import numpy as np
+import cv2
+
+class_names = [
+    "person",
+    "bicycle",
+    "car",
+    "motorcycle",
+    "airplane",
+    "bus",
+    "train",
+    "truck",
+    "boat",
+    "traffic light",
+    "fire hydrant",
+    "stop sign",
+    "parking meter",
+    "bench",
+    "bird",
+    "cat",
+    "dog",
+    "horse",
+    "sheep",
+    "cow",
+    "elephant",
+    "bear",
+    "zebra",
+    "giraffe",
+    "backpack",
+    "umbrella",
+    "handbag",
+    "tie",
+    "suitcase",
+    "frisbee",
+    "skis",
+    "snowboard",
+    "sports ball",
+    "kite",
+    "baseball bat",
+    "baseball glove",
+    "skateboard",
+    "surfboard",
+    "tennis racket",
+    "bottle",
+    "wine glass",
+    "cup",
+    "fork",
+    "knife",
+    "spoon",
+    "bowl",
+    "banana",
+    "apple",
+    "sandwich",
+    "orange",
+    "broccoli",
+    "carrot",
+    "hot dog",
+    "pizza",
+    "donut",
+    "cake",
+    "chair",
+    "couch",
+    "potted plant",
+    "bed",
+    "dining table",
+    "toilet",
+    "tv",
+    "laptop",
+    "mouse",
+    "remote",
+    "keyboard",
+    "cell phone",
+    "microwave",
+    "oven",
+    "toaster",
+    "sink",
+    "refrigerator",
+    "book",
+    "clock",
+    "vase",
+    "scissors",
+    "teddy bear",
+    "hair drier",
+    "toothbrush",
+]
+
+# Create a list of colors for each class where each color is a tuple of 3 integer values
+rng = np.random.default_rng(3)
+colors = rng.uniform(0, 255, size=(len(class_names), 3))
+
+
+def nms(boxes, scores, iou_threshold):
+    # Sort by score, highest first
+    sorted_indices = np.argsort(scores)[::-1]
+
+    keep_boxes = []
+    while sorted_indices.size > 0:
+        # Pick the box with the highest remaining score
+        box_id = sorted_indices[0]
+        keep_boxes.append(box_id)
+
+        # Compute IoU of the picked box with the rest
+        ious = compute_iou(boxes[box_id, :], boxes[sorted_indices[1:], :])
+
+        # Remove boxes with IoU over the threshold
+        keep_indices = np.where(ious < iou_threshold)[0]
+
+        sorted_indices = sorted_indices[keep_indices + 1]
+
+    return keep_boxes
+
+
+def multiclass_nms(boxes, scores, class_ids, iou_threshold):
+    unique_class_ids = np.unique(class_ids)
+
+    keep_boxes = []
+    for class_id in unique_class_ids:
+        class_indices = np.where(class_ids == class_id)[0]
+        class_boxes = boxes[class_indices, :]
+        class_scores = scores[class_indices]
+
+        class_keep_boxes = nms(class_boxes, class_scores, iou_threshold)
+        keep_boxes.extend(class_indices[class_keep_boxes])
+
+    return keep_boxes
+
+
+def compute_iou(box, boxes):
+    # Compute xmin, ymin, xmax, ymax of the intersection rectangles
+    xmin = np.maximum(box[0], boxes[:, 0])
+    ymin = np.maximum(box[1], boxes[:, 1])
+    xmax = np.minimum(box[2], boxes[:, 2])
+    ymax = np.minimum(box[3], boxes[:, 3])
+
+    # Compute intersection area
+    intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)
+
+    # Compute union area
+    box_area = (box[2] - box[0]) * (box[3] - box[1])
+    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+    union_area = box_area + boxes_area - intersection_area
+
+    # Compute IoU
+    iou = intersection_area / union_area
+
+    return iou
+
+
+def xywh2xyxy(x):
+    # Convert bounding box (x, y, w, h) to bounding box (x1, y1, x2, y2)
+    y = np.copy(x)
+    y[..., 0] = x[..., 0] - x[..., 2] / 2
+    y[..., 1] = x[..., 1] - x[..., 3] / 2
+    y[..., 2] = x[..., 0] + x[..., 2] / 2
+    y[..., 3] = x[..., 1] + x[..., 3] / 2
+    return y
+
+
+def draw_detections(image, boxes, scores, class_ids, mask_alpha=0.3):
+    det_img = image.copy()
+
+    img_height, img_width = image.shape[:2]
+    font_size = min([img_height, img_width]) * 0.0006
+    text_thickness = int(min([img_height, img_width]) * 0.001)
+
+    # det_img = draw_masks(det_img, boxes, class_ids, mask_alpha)
+
+    # Draw bounding boxes and labels of detections
+    for class_id, box, score in zip(class_ids, boxes, scores):
+        color = colors[class_id]
+
+        draw_box(det_img, box, color)  # type: ignore
+
+        label = class_names[class_id]
+        caption = f"{label} {int(score * 100)}%"
+        draw_text(det_img, caption, box, color, font_size, text_thickness)  # type: ignore
+
+    return det_img
+
+
+def draw_box(
+    image: np.ndarray,
+    box: np.ndarray,
+    color: tuple[int, int, int] = (0, 0, 255),
+    thickness: int = 2,
+) -> np.ndarray:
+    x1, y1, x2, y2 = box.astype(int)
+    return cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness)
+
+
+def draw_text(
+    image: np.ndarray,
+    text: str,
+    box: np.ndarray,
+    color: tuple[int, int, int] = (0, 0, 255),
+    font_size: float = 0.001,
+    text_thickness: int = 2,
+) -> np.ndarray:
+    x1, y1, x2, y2 = box.astype(int)
+    (tw, th), _ = cv2.getTextSize(
+        text=text,
+        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
+        fontScale=font_size,
+        thickness=text_thickness,
+    )
+    th = int(th * 1.2)
+
+    # Filled label background above the box's top-left corner
+    cv2.rectangle(image, (x1, y1), (x1 + tw, y1 - th), color, -1)
+
+    return cv2.putText(
+        image,
+        text,
+        (x1, y1),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        font_size,
+        (255, 255, 255),
+        text_thickness,
+        cv2.LINE_AA,
+    )
+
+
+def draw_masks(
+    image: np.ndarray, boxes: np.ndarray, classes: np.ndarray, mask_alpha: float = 0.3
+) -> np.ndarray:
+    mask_img = image.copy()
+
+    # Draw a filled rectangle per detection in the mask image
+    for box, class_id in zip(boxes, classes):
+        color = colors[class_id]
+
+        x1, y1, x2, y2 = box.astype(int)
+
+        # Draw fill rectangle in mask image
+        cv2.rectangle(mask_img, (x1, y1), (x2, y2), color, -1)  # type: ignore
+
+    return cv2.addWeighted(mask_img, mask_alpha, image, 1 - mask_alpha, 0)
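
A toy check of the NMS helpers above, showing how an overlapping lower-scoring box is suppressed while a distant box survives (the expected values in the comments come from hand-computing the IoU, not from the commit):

```python
import numpy as np
from utils import compute_iou, nms

# Two heavily overlapping boxes plus one far away, all in xyxy format.
boxes = np.array(
    [[0, 0, 10, 10], [1, 1, 11, 11], [50, 50, 60, 60]], dtype=np.float32
)
scores = np.array([0.9, 0.8, 0.7])

print(compute_iou(boxes[0], boxes[1:]))       # approx. [0.68, 0.0]
print(nms(boxes, scores, iou_threshold=0.5))  # [0, 2]: box 1 is suppressed by box 0
```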