benchmark: add vittrack benchmark results (#214)
* create vittrack.py for benchmark; modify demo.py to use VitTrack
* add benchmark config
* add benchmark results on some hardware
* update benchmark results on jetson, jetson orin, vim4
* update results on sunrise x3, 200i dk, rv1126
* update benchmark results on atlas 200 and maix
* improve doc; update results on cuda
* doc correction
- benchmark/README.md +56 -3
- benchmark/color_table.svg +187 -7
- benchmark/config/object_tracking_vittrack.yaml +14 -0
- benchmark/table_config.yaml +7 -0
- models/__init__.py +2 -0
- models/object_tracking_vittrack/README.md +7 -6
- models/object_tracking_vittrack/demo.py +38 -12
- models/object_tracking_vittrack/vittrack.py +39 -0
benchmark/README.md
CHANGED
@@ -94,6 +94,7 @@ mean median min input size model
 46.10     47.53     43.06     [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 144.89    149.58    125.71    [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 143.83    146.39    119.75    [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+12.52     14.47     11.63     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 12.99     13.11     12.14     [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 12.64     12.44     10.82     [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 12.64     11.83     11.03     [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -148,6 +149,7 @@ mean median min input size model
 212.90    212.93    209.55    [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 1690.06   2303.34   1480.63   [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 1489.54   1435.48   1308.12   [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+90.49     89.23     86.83     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 356.63    357.29    354.42    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 217.52    229.39    101.61    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 198.63    198.25    196.68    [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -203,6 +205,7 @@ mean median min input size model
 216.18    216.19    214.30    [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 1207.83   1208.71   1203.64   [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 1236.98   1250.21   1203.64   [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+123.30    125.37    116.69    [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 124.89    125.25    124.53    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 107.99    109.82    94.05     [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 108.41    108.33    107.91    [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -241,6 +244,7 @@ mean median min input size model
 54.24     55.24     52.87     [320, 240]   LPD_YuNet with ['license_plate_detection_lpd_yunet_2023mar.onnx']
 63.63     63.43     63.32     [416, 416]   NanoDet with ['object_detection_nanodet_2022nov.onnx']
 371.45    378.00    366.39    [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
+43.06     42.32     39.92     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 33.85     33.90     33.61     [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 38.16     37.33     37.10     [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
 91.65     91.98     89.90     [128, 256]   YoutuReID with ['person_reid_youtu_2021nov.onnx']
@@ -271,6 +275,7 @@ mean median min input size model
 366.46    366.88    363.46    [320, 240]   LPD_YuNet with ['license_plate_detection_lpd_yunet_2023mar.onnx']
 163.06    163.34    161.77    [416, 416]   NanoDet with ['object_detection_nanodet_2022nov.onnx']
 301.10    311.52    297.74    [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
+43.36     40.65     39.85     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 149.37    149.95    148.01    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 153.89    153.96    153.43    [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
 44.29     44.03     43.62     [128, 256]   YoutuReID with ['person_reid_youtu_2021nov.onnx']
@@ -318,6 +323,7 @@ mean median min input size model
 212.69    262.75    170.88    [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 1110.87   1112.27   1085.31   [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 1128.73   1157.12   1085.31   [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+67.31     67.41     66.23     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 147.01    144.01    139.27    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 119.70    118.95    94.09     [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 107.63    107.09    105.61    [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -398,6 +404,7 @@ mean median min input size model
 322.98    323.45    312.13    [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 1875.33   1877.53   1871.26   [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 1989.04   2005.25   1871.26   [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+143.62    143.19    137.16    [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 159.80    159.62    159.40    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 152.18    152.86    145.56    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 145.83    145.77    145.45    [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -418,8 +425,10 @@ mean median min input size model
 
 NPU (CANN):
 
+<!-- vittrack is excluded due to HardSwish is not supported by CANN backend yet -->
+
 ```
-$ python3 benchmark.py --all --fp32 --cfg_exclude wechat:crnn --model_exclude pose_estimation_mediapipe_2023mar.onnx --cfg_overwrite_backend_target 4
+$ python3 benchmark.py --all --fp32 --cfg_exclude wechat:crnn:vittrack --model_exclude pose_estimation_mediapipe_2023mar.onnx --cfg_overwrite_backend_target 4
 Benchmarking ...
 backend=cv.dnn.DNN_BACKEND_CANN
 target=cv.dnn.DNN_TARGET_NPU
@@ -478,6 +487,7 @@ mean median min input size model
 1903.82   1962.71   1533.79   [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 37604.10  37569.30  37502.48  [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 24229.20  25577.94  13483.54  [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+415.72    403.04    399.44    [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 1133.44   1131.54   1124.83   [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 883.96    919.07    655.33    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 1430.98   1424.55   1415.68   [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -533,6 +543,7 @@ mean median min input size model
 117.28    150.31    83.33     [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 553.58    558.76    535.47    [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 594.18    592.64    535.47    [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+49.47     49.21     48.84     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 56.35     55.73     55.25     [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 57.07     57.19     55.25     [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 47.94     48.41     47.05     [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -587,6 +598,7 @@ mean median min input size model
 406.28    416.58    385.68    [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 2608.90   2612.42   2597.93   [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 2609.88   2609.39   2597.93   [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+189.23    188.72    182.28    [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 228.95    228.74    228.35    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 227.97    228.61    226.76    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 192.29    192.26    191.74    [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -643,6 +655,7 @@ mean median min input size model
 3002.36   3047.94   2655.38   [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 50678.08  50651.82  50651.19  [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 36249.71  37771.22  24606.37  [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+707.79    706.32    699.40    [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 1502.15   1501.98   1500.99   [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 1300.15   1320.44   1137.60   [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 1993.05   1993.98   1991.86   [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -669,9 +682,8 @@ Specs: [details_cn](https://doc.rvspace.org/VisionFive2/PB/VisionFive_2/specific
 
 CPU:
 <!-- config wechat is excluded due to it needs building with opencv_contrib -->
-<!-- config vittrack is excluded due to opencv cannot find ffmpeg and its components -->
 ```
-$ python3 benchmark.py --all --cfg_exclude wechat
+$ python3 benchmark.py --all --cfg_exclude wechat
 Benchmarking ...
 backend=cv.dnn.DNN_BACKEND_OPENCV
 target=cv.dnn.DNN_TARGET_CPU
@@ -698,6 +710,7 @@ mean median min input size model
 1434.56   1463.32   1194.57   [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 26172.62  26160.04  26151.67  [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 17004.06  17909.88  9659.54   [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+304.58    309.56    280.05    [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 734.97    735.58    733.95    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 609.61    621.69    508.04    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 961.41    962.26    960.39    [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -715,3 +728,43 @@ mean median min input size model
 1237.00   1395.68   807.66    [1280, 720]  CRNN with ['text_recognition_CRNN_CN_2021nov_int8.onnx']
 1169.59   1415.29   774.09    [1280, 720]  CRNN with ['text_recognition_CRNN_EN_2022oct_int8.onnx']
 ```
+
+<!--
+
+### Khadas VIM4
+
+CPU:
+
+```
+67.65     67.84     66.39     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
+```
+
+### NVIDIA Jetson Orin Nano
+
+CPU:
+
+```
+59.30     58.45     57.90     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
+```
+
+CUDA:
+
+```
+13.69     13.69     13.04     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
+```
+
+CUDA-FP16:
+
+```
+16.29     15.77     15.77     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
+```
+
+### Atlas 200I DK
+
+CPU:
+
+```
+88.24     87.00     84.23     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
+```
+
+-->
benchmark/color_table.svg
CHANGED
benchmark/config/object_tracking_vittrack.yaml
ADDED
@@ -0,0 +1,14 @@
+Benchmark:
+  name: "Object Tracking Benchmark"
+  type: "Tracking"
+  data:
+    type: "TrackingVideoLoader"
+    path: "data/object_tracking"
+    files: ["throw_cup.mp4"]
+  metric:
+    type: "Tracking"
+  backend: "default"
+  target: "cpu"
+
+Model:
+  name: "VitTrack"
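With this config in place, the VitTrack rows above are picked up by `--all` runs. A per-config invocation would presumably look like the sketch below; the `--cfg` flag is an assumption based on how other configs in this repository are run and is not shown in this diff:

```
# assumed per-config invocation; benchmark/README.md only shows the --all form
$ python3 benchmark.py --cfg ./config/object_tracking_vittrack.yaml
```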
benchmark/table_config.yaml
CHANGED
@@ -157,6 +157,13 @@ Models:
   acceptable_time: 700
   keyword: "pose_estimation_mediapipe"
 
+- name: "VitTrack"
+  task: "Object Tracking"
+  input_size: "1280x720"
+  folder: "object_tracking_vittrack"
+  acceptable_time: 1000
+  keyword: "object_tracking_vittrack"
+
 
 Devices:
 - name: "Intel 12700K"
models/__init__.py
CHANGED
@@ -19,6 +19,7 @@ from .license_plate_detection_yunet.lpd_yunet import LPD_YuNet
 from .object_detection_nanodet.nanodet import NanoDet
 from .object_detection_yolox.yolox import YoloX
 from .facial_expression_recognition.facial_fer_model import FacialExpressionRecog
+from .object_tracking_vittrack.vittrack import VitTrack
 
 class ModuleRegistery:
     def __init__(self, name):
@@ -92,3 +93,4 @@ MODELS.register(LPD_YuNet)
 MODELS.register(NanoDet)
 MODELS.register(YoloX)
 MODELS.register(FacialExpressionRecog)
+MODELS.register(VitTrack)
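The two added lines make the tracker importable and register it so the benchmark can find it by name. The actual ModuleRegistery implementation is only partially visible in this diff; the sketch below is a generic illustration of the registration pattern, not the repository's code:

```python
class ModuleRegistry:
    """Generic name -> class registry (illustrative only, not the repo's ModuleRegistery)."""
    def __init__(self, name):
        self.name = name
        self._modules = {}

    def register(self, cls):
        # store the class under its own name, e.g. "VitTrack"
        self._modules[cls.__name__] = cls

    def get(self, name):
        return self._modules[name]


MODELS = ModuleRegistry("models")

class VitTrack:  # stand-in for models.object_tracking_vittrack.vittrack.VitTrack
    pass

MODELS.register(VitTrack)
assert MODELS.get("VitTrack") is VitTrack
```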
models/object_tracking_vittrack/README.md
CHANGED
@@ -1,18 +1,19 @@
 # VIT tracker
 
-VIT tracker(vision transformer tracker) is a much better model for
-video
-
+VIT tracker (vision transformer tracker) is a stronger model for real-time object tracking. On an ARM chip it runs about 20% faster than NanoTrack in single-threaded mode, and the advantage is even larger in multi-threaded mode. It also performs better than NanoTrack on the dataset, and it provides a confidence value during tracking that can be used to determine whether the target is currently lost.
+
+In object tracking, the score is an important indicator of whether the target has been lost. In the demo video, VIT tracker displays the current score in the upper-left corner; when the target is lost, the score drops significantly. NanoTrack, by contrast, returns a score of 0.9 in any situation, so it cannot be used to tell whether the target is lost.
+
+Video demo: https://youtu.be/MJiPnu1ZQRI
 
 This model is contributed by [Pengyu Liu](https://github.com/lpylpy0514) in GSoC 2023 project [**Realtime object tracking models**](https://github.com/opencv/opencv/wiki/GSoC_2023#idea-realtime-object-tracking-models)
 
-**NOTE: OpenCV > 4.8.0
+**NOTE: OpenCV > 4.8.0 is required. Build from source with instructions from https://opencv.org/get-started/.**
 
 # Demo
 
 ```bash
-#
+# tracking on video
 python demo.py --input /path/to/video
 
 # get help regarding various parameters
@@ -59,4 +60,4 @@ All files in this directory are licensed under [Apache 2.0 License](./LICENSE).
 
 OSTrack: https://github.com/botaoye/OSTrack
 
-OpenCV Sample: https://github.com/opencv/opencv/blob/4.x/samples/dnn/vit_tracker.cpp
+OpenCV Sample: https://github.com/opencv/opencv/blob/4.x/samples/dnn/vit_tracker.cpp
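Since the rewritten demo.py below adds a `--backend_target` flag, the demo can also be pointed at other backends; for example, index 1 selects the CUDA pair in `backend_target_pairs`:

```bash
# run the demo on GPU via the CUDA backend (index 1 in backend_target_pairs)
python demo.py --input /path/to/video --backend_target 1
```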
models/object_tracking_vittrack/demo.py
CHANGED
@@ -1,17 +1,45 @@
+# This file is part of OpenCV Zoo project.
+# It is subject to the license terms in the LICENSE file found in the same directory.
+
+import argparse
+
 import numpy as np
 import cv2 as cv
-
+
+
+from vittrack import VitTrack
 
 # Check OpenCV version
 assert cv.__version__ > "4.8.0", \
        "Please install latest opencv-python to try this demo: python3 -m pip install --upgrade opencv-python"
 
+# Valid combinations of backends and targets
+backend_target_pairs = [
+    [cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_CPU],
+    [cv.dnn.DNN_BACKEND_CUDA, cv.dnn.DNN_TARGET_CUDA],
+    [cv.dnn.DNN_BACKEND_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16],
+    [cv.dnn.DNN_BACKEND_TIMVX, cv.dnn.DNN_TARGET_NPU],
+    [cv.dnn.DNN_BACKEND_CANN, cv.dnn.DNN_TARGET_NPU]
+]
+
 parser = argparse.ArgumentParser(
     description="VIT track opencv API")
 parser.add_argument('--input', '-i', type=str,
                     help='Usage: Set path to the input video. Omit for using default camera.')
 parser.add_argument('--model_path', type=str, default='object_tracking_vittrack_2023sep.onnx',
                     help='Usage: Set model path')
+parser.add_argument('--backend_target', '-bt', type=int, default=0,
+                    help='''Choose one of the backend-target pair to run this demo:
+                        {:d}: (default) OpenCV implementation + CPU,
+                        {:d}: CUDA + GPU (CUDA),
+                        {:d}: CUDA + GPU (CUDA FP16),
+                        {:d}: TIM-VX + NPU,
+                        {:d}: CANN + NPU
+                    '''.format(*[x for x in range(len(backend_target_pairs))]))
+parser.add_argument('--save', '-s', action='store_true',
+                    help='Usage: Specify to save a file with results. Invalid in case of camera input.')
+parser.add_argument('--vis', '-v', action='store_true',
+                    help='Usage: Specify to open a new window to show results. Invalid in case of camera input.')
 args = parser.parse_args()
 
 def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),text_color=(0, 255, 0), fontScale = 1, fontSize = 1):
@@ -35,16 +63,16 @@ def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),tex
     return output
 
 if __name__ == '__main__':
+    backend_id = backend_target_pairs[args.backend_target][0]
+    target_id = backend_target_pairs[args.backend_target][1]
 
-
-
-
+    model = VitTrack(
+        model_path=args.model_path,
+        backend_id=backend_id,
+        target_id=target_id)
 
     # Read from args.input
-    _input = args.input
-    if args.input is None:
-        device_id = 0
-        _input = device_id
+    _input = 0 if args.input is None else args.input
     video = cv.VideoCapture(_input)
 
     # Select an object
@@ -75,11 +103,9 @@ if __name__ == '__main__':
            break
        # Inference
        tm.start()
-       isLocated, bbox = model.
-       score = model.getTrackingScore()
+       isLocated, bbox, score = model.infer(frame)
        tm.stop()
        # Visualize
        frame = visualize(frame, bbox, score, isLocated, fps=tm.getFPS())
-       cv.imshow('
+       cv.imshow('VitTrack Demo', frame)
        tm.reset()
-
models/object_tracking_vittrack/vittrack.py
ADDED
@@ -0,0 +1,39 @@
+# This file is part of OpenCV Zoo project.
+# It is subject to the license terms in the LICENSE file found in the same directory.
+
+import numpy as np
+import cv2 as cv
+
+class VitTrack:
+    def __init__(self, model_path, backend_id=0, target_id=0):
+        self.model_path = model_path
+        self.backend_id = backend_id
+        self.target_id = target_id
+
+        self.params = cv.TrackerVit_Params()
+        self.params.net = self.model_path
+        self.params.backend = self.backend_id
+        self.params.target = self.target_id
+
+        self.model = cv.TrackerVit_create(self.params)
+
+    @property
+    def name(self):
+        return self.__class__.__name__
+
+    def setBackendAndTarget(self, backend_id, target_id):
+        self.backend_id = backend_id
+        self.target_id = target_id
+
+        self.params.backend = self.backend_id
+        self.params.target = self.target_id
+
+        self.model = cv.TrackerVit_create(self.params)
+
+    def init(self, image, roi):
+        self.model.init(image, roi)
+
+    def infer(self, image):
+        is_located, bbox = self.model.update(image)
+        score = self.model.getTrackingScore()
+        return is_located, bbox, score
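The wrapper mirrors how demo.py uses it: construct once, `init` with a region of interest, then call `infer` per frame. A minimal usage sketch follows; the input video and the interactive ROI selection are illustrative, not part of this PR:

```python
import cv2 as cv

from vittrack import VitTrack

tracker = VitTrack(model_path='object_tracking_vittrack_2023sep.onnx')

video = cv.VideoCapture('throw_cup.mp4')    # any video; this file name comes from the benchmark config
ok, frame = video.read()
roi = cv.selectROI('Select object', frame)  # draw the initial bounding box, then press Enter
tracker.init(frame, roi)

while True:
    ok, frame = video.read()
    if not ok:
        break
    is_located, bbox, score = tracker.infer(frame)
    if is_located:
        x, y, w, h = bbox
        cv.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
        cv.putText(frame, '{:.2f}'.format(score), (x, y - 5),
                   cv.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0))
    cv.imshow('VitTrack', frame)
    if cv.waitKey(1) > 0:
        break
```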