freddyaboulton (HF Staff) committed
Commit 9bd282f · verified · 1 Parent(s): 88eaa72

Upload folder using huggingface_hub

Files changed (6):
  1. README.md +8 -8
  2. inference.py +148 -0
  3. requirements.txt +8 -0
  4. run.ipynb +1 -0
  5. run.py +72 -0
  6. utils.py +237 -0
README.md CHANGED
@@ -1,12 +1,12 @@
+
 ---
-title: Yolov10 Webcam Stream Main
-emoji: 👁
-colorFrom: pink
-colorTo: blue
+title: yolov10_webcam_stream_main
+emoji: 🔥
+colorFrom: indigo
+colorTo: indigo
 sdk: gradio
-sdk_version: 4.44.1
-app_file: app.py
+sdk_version: 5.0.0
+app_file: run.py
 pinned: false
+hf_oauth: true
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
inference.py ADDED
@@ -0,0 +1,148 @@
+import time
+import cv2
+import numpy as np
+import onnxruntime  # type: ignore
+
+from utils import draw_detections  # type: ignore
+
+
+class YOLOv10:
+    def __init__(self, path):
+        # Initialize model
+        self.initialize_model(path)
+
+    def __call__(self, image):
+        return self.detect_objects(image)
+
+    def initialize_model(self, path):
+        self.session = onnxruntime.InferenceSession(
+            path, providers=onnxruntime.get_available_providers()
+        )
+        # Get model info
+        self.get_input_details()
+        self.get_output_details()
+
+    def detect_objects(self, image, conf_threshold=0.3):
+        input_tensor = self.prepare_input(image)
+
+        # Perform inference on the image
+        new_image = self.inference(image, input_tensor, conf_threshold)
+
+        return new_image
+
+    def prepare_input(self, image):
+        self.img_height, self.img_width = image.shape[:2]
+
+        input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+        # Resize input image
+        input_img = cv2.resize(input_img, (self.input_width, self.input_height))
+
+        # Scale input pixel values to 0 to 1
+        input_img = input_img / 255.0
+        input_img = input_img.transpose(2, 0, 1)
+        input_tensor = input_img[np.newaxis, :, :, :].astype(np.float32)
+
+        return input_tensor
+
+    def inference(self, image, input_tensor, conf_threshold=0.3):
+        start = time.perf_counter()
+        outputs = self.session.run(
+            self.output_names, {self.input_names[0]: input_tensor}
+        )
+
+        print(f"Inference time: {(time.perf_counter() - start) * 1000:.2f} ms")
+        (
+            boxes,
+            scores,
+            class_ids,
+        ) = self.process_output(outputs, conf_threshold)
+        return self.draw_detections(image, boxes, scores, class_ids)
+
+    def process_output(self, output, conf_threshold=0.3):
+        predictions = np.squeeze(output[0])
+
+        # Filter out object confidence scores below threshold
+        scores = predictions[:, 4]
+        predictions = predictions[scores > conf_threshold, :]
+        scores = scores[scores > conf_threshold]
+
+        if len(scores) == 0:
+            return [], [], []
+
+        # Get the class with the highest confidence
+        class_ids = predictions[:, 5].astype(int)
+
+        # Get bounding boxes for each object
+        boxes = self.extract_boxes(predictions)
+
+        return boxes, scores, class_ids
+
+    def extract_boxes(self, predictions):
+        # Extract boxes from predictions
+        boxes = predictions[:, :4]
+
+        # Scale boxes to original image dimensions
+        boxes = self.rescale_boxes(boxes)
+
+        # Boxes are already in xyxy format, so no xywh -> xyxy conversion is needed
+        # boxes = xywh2xyxy(boxes)
+
+        return boxes
+
+    def rescale_boxes(self, boxes):
+        # Rescale boxes to original image dimensions
+        input_shape = np.array(
+            [self.input_width, self.input_height, self.input_width, self.input_height]
+        )
+        boxes = np.divide(boxes, input_shape, dtype=np.float32)
+        boxes *= np.array(
+            [self.img_width, self.img_height, self.img_width, self.img_height]
+        )
+        return boxes
+
+    def draw_detections(
+        self, image, boxes, scores, class_ids, draw_scores=True, mask_alpha=0.4
+    ):
+        return draw_detections(image, boxes, scores, class_ids, mask_alpha)
+
+    def get_input_details(self):
+        model_inputs = self.session.get_inputs()
+        self.input_names = [model_inputs[i].name for i in range(len(model_inputs))]
+
+        self.input_shape = model_inputs[0].shape
+        self.input_height = self.input_shape[2]
+        self.input_width = self.input_shape[3]
+
+    def get_output_details(self):
+        model_outputs = self.session.get_outputs()
+        self.output_names = [model_outputs[i].name for i in range(len(model_outputs))]
+
+
+if __name__ == "__main__":
+    import requests
+    import tempfile
+    from huggingface_hub import hf_hub_download
+
+    model_file = hf_hub_download(
+        repo_id="onnx-community/yolov10s", filename="onnx/model.onnx"
+    )
+
+    yolov10_detector = YOLOv10(model_file)
+
+    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as f:
+        f.write(
+            requests.get(
+                "https://live.staticflickr.com/13/19041780_d6fd803de0_3k.jpg"
+            ).content
+        )
+        f.seek(0)
+        img = cv2.imread(f.name)
+
+    # Detect objects
+    combined_image = yolov10_detector.detect_objects(img)
+
+    # Draw detections
+    cv2.namedWindow("Output", cv2.WINDOW_NORMAL)
+    cv2.imshow("Output", combined_image)
+    cv2.waitKey(0)
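
The `__main__` block above needs a display server for `cv2.imshow`. A minimal headless sketch of using the same class (the file paths here are placeholders, not part of the commit):

```python
import cv2
from inference import YOLOv10

# "model.onnx" is a placeholder path: any YOLOv10 ONNX export whose output
# rows follow the (x1, y1, x2, y2, score, class_id) layout assumed above.
detector = YOLOv10("model.onnx")

frame = cv2.imread("frame.jpg")  # placeholder BGR test image, any resolution
annotated = detector.detect_objects(frame, conf_threshold=0.5)
cv2.imwrite("annotated.jpg", annotated)  # write to disk instead of cv2.imshow
```
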
requirements.txt ADDED
@@ -0,0 +1,8 @@
+gradio-client @ git+https://github.com/gradio-app/gradio@bbf9ba7e997022960c621f72baa891185bd03732#subdirectory=client/python
+https://gradio-pypi-previews.s3.amazonaws.com/bbf9ba7e997022960c621f72baa891185bd03732/gradio-5.0.0-py3-none-any.whl
+safetensors==0.4.3
+opencv-python
+twilio
+gradio>=5.0,<6.0
+gradio-webrtc==0.0.1
+onnxruntime-gpu
run.ipynb ADDED
@@ -0,0 +1 @@
 
 
+ {"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: yolov10_webcam_stream"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio safetensors==0.4.3 opencv-python twilio gradio>=5.0,<6.0 gradio-webrtc==0.0.1 onnxruntime-gpu"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/yolov10_webcam_stream/inference.py\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/yolov10_webcam_stream/utils.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import cv2\n", "from huggingface_hub import hf_hub_download\n", "from gradio_webrtc import WebRTC # type: ignore\n", "from twilio.rest import Client # type: ignore\n", "import os\n", "from inference import YOLOv10 # type: ignore\n", "\n", "model_file = hf_hub_download(\n", " repo_id=\"onnx-community/yolov10n\", filename=\"onnx/model.onnx\"\n", ")\n", "\n", "model = YOLOv10(model_file)\n", "\n", "account_sid = os.environ.get(\"TWILIO_ACCOUNT_SID\")\n", "auth_token = os.environ.get(\"TWILIO_AUTH_TOKEN\")\n", "\n", "if account_sid and auth_token:\n", " client = Client(account_sid, auth_token)\n", "\n", " token = client.tokens.create()\n", "\n", " rtc_configuration = {\n", " \"iceServers\": token.ice_servers,\n", " \"iceTransportPolicy\": \"relay\",\n", " }\n", "else:\n", " rtc_configuration = None\n", "\n", "\n", "def detection(image, conf_threshold=0.3):\n", " image = cv2.resize(image, (model.input_width, model.input_height))\n", " new_image = model.detect_objects(image, conf_threshold)\n", " return cv2.resize(new_image, (500, 500))\n", "\n", "\n", "css = \"\"\".my-group {max-width: 600px !important; max-height: 600 !important;}\n", " .my-column {display: flex !important; justify-content: center !important; align-items: center !important};\"\"\"\n", "\n", "\n", "with gr.Blocks(css=css) as demo:\n", " gr.HTML(\n", " \"\"\"\n", " <h1 style='text-align: center'>\n", " YOLOv10 Webcam Stream (Powered by WebRTC \u26a1\ufe0f)\n", " </h1>\n", " \"\"\"\n", " )\n", " gr.HTML(\n", " \"\"\"\n", " <h3 style='text-align: center'>\n", " <a href='https://arxiv.org/abs/2405.14458' target='_blank'>arXiv</a> | <a href='https://github.com/THU-MIG/yolov10' target='_blank'>github</a>\n", " </h3>\n", " \"\"\"\n", " )\n", " with gr.Column(elem_classes=[\"my-column\"]):\n", " with gr.Group(elem_classes=[\"my-group\"]):\n", " image = WebRTC(label=\"Stream\", rtc_configuration=rtc_configuration)\n", " conf_threshold = gr.Slider(\n", " label=\"Confidence Threshold\",\n", " minimum=0.0,\n", " maximum=1.0,\n", " step=0.05,\n", " value=0.30,\n", " )\n", "\n", " image.stream(\n", " fn=detection, inputs=[image, conf_threshold], outputs=[image], time_limit=10\n", " )\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
run.py ADDED
@@ -0,0 +1,72 @@
+import gradio as gr
+import cv2
+from huggingface_hub import hf_hub_download
+from gradio_webrtc import WebRTC  # type: ignore
+from twilio.rest import Client  # type: ignore
+import os
+from inference import YOLOv10  # type: ignore
+
+model_file = hf_hub_download(
+    repo_id="onnx-community/yolov10n", filename="onnx/model.onnx"
+)
+
+model = YOLOv10(model_file)
+
+account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
+auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
+
+if account_sid and auth_token:
+    client = Client(account_sid, auth_token)
+
+    token = client.tokens.create()
+
+    rtc_configuration = {
+        "iceServers": token.ice_servers,
+        "iceTransportPolicy": "relay",
+    }
+else:
+    rtc_configuration = None
+
+
+def detection(image, conf_threshold=0.3):
+    image = cv2.resize(image, (model.input_width, model.input_height))
+    new_image = model.detect_objects(image, conf_threshold)
+    return cv2.resize(new_image, (500, 500))
+
+
+css = """.my-group {max-width: 600px !important; max-height: 600px !important;}
+.my-column {display: flex !important; justify-content: center !important; align-items: center !important;}"""
+
+
+with gr.Blocks(css=css) as demo:
+    gr.HTML(
+        """
+        <h1 style='text-align: center'>
+        YOLOv10 Webcam Stream (Powered by WebRTC ⚡️)
+        </h1>
+        """
+    )
+    gr.HTML(
+        """
+        <h3 style='text-align: center'>
+        <a href='https://arxiv.org/abs/2405.14458' target='_blank'>arXiv</a> | <a href='https://github.com/THU-MIG/yolov10' target='_blank'>github</a>
+        </h3>
+        """
+    )
+    with gr.Column(elem_classes=["my-column"]):
+        with gr.Group(elem_classes=["my-group"]):
+            image = WebRTC(label="Stream", rtc_configuration=rtc_configuration)
+            conf_threshold = gr.Slider(
+                label="Confidence Threshold",
+                minimum=0.0,
+                maximum=1.0,
+                step=0.05,
+                value=0.30,
+            )
+
+    image.stream(
+        fn=detection, inputs=[image, conf_threshold], outputs=[image], time_limit=10
+    )
+
+if __name__ == "__main__":
+    demo.launch()
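
To sanity-check the `detection` callback without a webcam, Twilio credentials, or the WebRTC UI, a sketch like the following should work (note that importing `run` downloads the model and builds the Blocks app at import time; `test.jpg` is a placeholder):

```python
import cv2
from run import detection  # triggers hf_hub_download and model init at import time

frame = cv2.imread("test.jpg")  # placeholder BGR frame standing in for a webcam capture
out = detection(frame, conf_threshold=0.3)
print(out.shape)  # (500, 500, 3): the callback resizes its output to 500x500
```
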
utils.py ADDED
@@ -0,0 +1,237 @@
+import numpy as np
+import cv2
+
+class_names = [
+    "person",
+    "bicycle",
+    "car",
+    "motorcycle",
+    "airplane",
+    "bus",
+    "train",
+    "truck",
+    "boat",
+    "traffic light",
+    "fire hydrant",
+    "stop sign",
+    "parking meter",
+    "bench",
+    "bird",
+    "cat",
+    "dog",
+    "horse",
+    "sheep",
+    "cow",
+    "elephant",
+    "bear",
+    "zebra",
+    "giraffe",
+    "backpack",
+    "umbrella",
+    "handbag",
+    "tie",
+    "suitcase",
+    "frisbee",
+    "skis",
+    "snowboard",
+    "sports ball",
+    "kite",
+    "baseball bat",
+    "baseball glove",
+    "skateboard",
+    "surfboard",
+    "tennis racket",
+    "bottle",
+    "wine glass",
+    "cup",
+    "fork",
+    "knife",
+    "spoon",
+    "bowl",
+    "banana",
+    "apple",
+    "sandwich",
+    "orange",
+    "broccoli",
+    "carrot",
+    "hot dog",
+    "pizza",
+    "donut",
+    "cake",
+    "chair",
+    "couch",
+    "potted plant",
+    "bed",
+    "dining table",
+    "toilet",
+    "tv",
+    "laptop",
+    "mouse",
+    "remote",
+    "keyboard",
+    "cell phone",
+    "microwave",
+    "oven",
+    "toaster",
+    "sink",
+    "refrigerator",
+    "book",
+    "clock",
+    "vase",
+    "scissors",
+    "teddy bear",
+    "hair drier",
+    "toothbrush",
+]
+
+# Create a list of colors for each class where each color is a tuple of 3 integer values
+rng = np.random.default_rng(3)
+colors = rng.uniform(0, 255, size=(len(class_names), 3))
+
+
+def nms(boxes, scores, iou_threshold):
+    # Sort by score, highest first
+    sorted_indices = np.argsort(scores)[::-1]
+
+    keep_boxes = []
+    while sorted_indices.size > 0:
+        # Pick the box with the highest remaining score
+        box_id = sorted_indices[0]
+        keep_boxes.append(box_id)
+
+        # Compute IoU of the picked box with the rest
+        ious = compute_iou(boxes[box_id, :], boxes[sorted_indices[1:], :])
+
+        # Remove boxes with IoU over the threshold
+        keep_indices = np.where(ious < iou_threshold)[0]
+
+        sorted_indices = sorted_indices[keep_indices + 1]
+
+    return keep_boxes
+
+
+def multiclass_nms(boxes, scores, class_ids, iou_threshold):
+    unique_class_ids = np.unique(class_ids)
+
+    keep_boxes = []
+    for class_id in unique_class_ids:
+        class_indices = np.where(class_ids == class_id)[0]
+        class_boxes = boxes[class_indices, :]
+        class_scores = scores[class_indices]
+
+        class_keep_boxes = nms(class_boxes, class_scores, iou_threshold)
+        keep_boxes.extend(class_indices[class_keep_boxes])
+
+    return keep_boxes
+
+
+def compute_iou(box, boxes):
+    # Compute xmin, ymin, xmax, ymax of the intersection rectangles
+    xmin = np.maximum(box[0], boxes[:, 0])
+    ymin = np.maximum(box[1], boxes[:, 1])
+    xmax = np.minimum(box[2], boxes[:, 2])
+    ymax = np.minimum(box[3], boxes[:, 3])
+
+    # Compute intersection area
+    intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)
+
+    # Compute union area
+    box_area = (box[2] - box[0]) * (box[3] - box[1])
+    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+    union_area = box_area + boxes_area - intersection_area
+
+    # Compute IoU
+    iou = intersection_area / union_area
+
+    return iou
+
+
+def xywh2xyxy(x):
+    # Convert bounding box (x, y, w, h) to bounding box (x1, y1, x2, y2)
+    y = np.copy(x)
+    y[..., 0] = x[..., 0] - x[..., 2] / 2
+    y[..., 1] = x[..., 1] - x[..., 3] / 2
+    y[..., 2] = x[..., 0] + x[..., 2] / 2
+    y[..., 3] = x[..., 1] + x[..., 3] / 2
+    return y
+
+
+def draw_detections(image, boxes, scores, class_ids, mask_alpha=0.3):
+    det_img = image.copy()
+
+    img_height, img_width = image.shape[:2]
+    font_size = min([img_height, img_width]) * 0.0006
+    text_thickness = int(min([img_height, img_width]) * 0.001)
+
+    # det_img = draw_masks(det_img, boxes, class_ids, mask_alpha)
+
+    # Draw bounding boxes and labels of detections
+    for class_id, box, score in zip(class_ids, boxes, scores):
+        color = colors[class_id]
+
+        draw_box(det_img, box, color)  # type: ignore
+
+        label = class_names[class_id]
+        caption = f"{label} {int(score * 100)}%"
+        draw_text(det_img, caption, box, color, font_size, text_thickness)  # type: ignore
+
+    return det_img
+
+
+def draw_box(
+    image: np.ndarray,
+    box: np.ndarray,
+    color: tuple[int, int, int] = (0, 0, 255),
+    thickness: int = 2,
+) -> np.ndarray:
+    x1, y1, x2, y2 = box.astype(int)
+    return cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness)
+
+
+def draw_text(
+    image: np.ndarray,
+    text: str,
+    box: np.ndarray,
+    color: tuple[int, int, int] = (0, 0, 255),
+    font_size: float = 0.001,
+    text_thickness: int = 2,
+) -> np.ndarray:
+    x1, y1, x2, y2 = box.astype(int)
+    (tw, th), _ = cv2.getTextSize(
+        text=text,
+        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
+        fontScale=font_size,
+        thickness=text_thickness,
+    )
+    th = int(th * 1.2)
+
+    # Filled label background above the box's top-left corner
+    cv2.rectangle(image, (x1, y1), (x1 + tw, y1 - th), color, -1)
+
+    return cv2.putText(
+        image,
+        text,
+        (x1, y1),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        font_size,
+        (255, 255, 255),
+        text_thickness,
+        cv2.LINE_AA,
+    )
+
+
+def draw_masks(
+    image: np.ndarray, boxes: np.ndarray, classes: np.ndarray, mask_alpha: float = 0.3
+) -> np.ndarray:
+    mask_img = image.copy()
+
+    # Draw a filled rectangle per detection in the mask image
+    for box, class_id in zip(boxes, classes):
+        color = colors[class_id]
+
+        x1, y1, x2, y2 = box.astype(int)
+
+        # Draw fill rectangle in mask image
+        cv2.rectangle(mask_img, (x1, y1), (x2, y2), color, -1)  # type: ignore
+
+    return cv2.addWeighted(mask_img, mask_alpha, image, 1 - mask_alpha, 0)
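
A toy check of the NMS helpers above, showing how an overlapping lower-scoring box is suppressed while a distant box survives (the expected values in the comments come from hand-computing the IoU, not from the commit):

```python
import numpy as np
from utils import compute_iou, nms

# Two heavily overlapping boxes plus one far away, all in xyxy format.
boxes = np.array(
    [[0, 0, 10, 10], [1, 1, 11, 11], [50, 50, 60, 60]], dtype=np.float32
)
scores = np.array([0.9, 0.8, 0.7])

print(compute_iou(boxes[0], boxes[1:]))       # approx. [0.68, 0.0]
print(nms(boxes, scores, iou_threshold=0.5))  # [0, 2]: box 1 is suppressed by box 0
```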