ytfeng committed
Commit 3b8b9d5 · 1 Parent(s): 7c978be

benchmark: add vittrack benchmark results (#214)


* create vittrack.py for benchmark; modify demo.py to use VitTrack

* add benchmark config

* add benchmark results on some hardware

* update benchmark results on jetson, jetson orin, vim4

* update results on sunrise x3, 200i dk, rv1126

* update benchmark results on atlas 200 and maix

* improve doc; update results on cuda

* doc correction

benchmark/README.md CHANGED
@@ -94,6 +94,7 @@ mean median min input size model
  46.10 47.53 43.06 [416, 416] NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
  144.89 149.58 125.71 [640, 640] YoloX with ['object_detection_yolox_2022nov.onnx']
  143.83 146.39 119.75 [640, 640] YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+ 12.52 14.47 11.63 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
  12.99 13.11 12.14 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
  12.64 12.44 10.82 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
  12.64 11.83 11.03 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -148,6 +149,7 @@ mean median min input size model
  212.90 212.93 209.55 [416, 416] NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
  1690.06 2303.34 1480.63 [640, 640] YoloX with ['object_detection_yolox_2022nov.onnx']
  1489.54 1435.48 1308.12 [640, 640] YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+ 90.49 89.23 86.83 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
  356.63 357.29 354.42 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
  217.52 229.39 101.61 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
  198.63 198.25 196.68 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -203,6 +205,7 @@ mean median min input size model
  216.18 216.19 214.30 [416, 416] NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
  1207.83 1208.71 1203.64 [640, 640] YoloX with ['object_detection_yolox_2022nov.onnx']
  1236.98 1250.21 1203.64 [640, 640] YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+ 123.30 125.37 116.69 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
  124.89 125.25 124.53 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
  107.99 109.82 94.05 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
  108.41 108.33 107.91 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -241,6 +244,7 @@ mean median min input size model
  54.24 55.24 52.87 [320, 240] LPD_YuNet with ['license_plate_detection_lpd_yunet_2023mar.onnx']
  63.63 63.43 63.32 [416, 416] NanoDet with ['object_detection_nanodet_2022nov.onnx']
  371.45 378.00 366.39 [640, 640] YoloX with ['object_detection_yolox_2022nov.onnx']
+ 43.06 42.32 39.92 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
  33.85 33.90 33.61 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
  38.16 37.33 37.10 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
  91.65 91.98 89.90 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
@@ -271,6 +275,7 @@ mean median min input size model
  366.46 366.88 363.46 [320, 240] LPD_YuNet with ['license_plate_detection_lpd_yunet_2023mar.onnx']
  163.06 163.34 161.77 [416, 416] NanoDet with ['object_detection_nanodet_2022nov.onnx']
  301.10 311.52 297.74 [640, 640] YoloX with ['object_detection_yolox_2022nov.onnx']
+ 43.36 40.65 39.85 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
  149.37 149.95 148.01 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
  153.89 153.96 153.43 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
  44.29 44.03 43.62 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
@@ -318,6 +323,7 @@ mean median min input size model
  212.69 262.75 170.88 [416, 416] NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
  1110.87 1112.27 1085.31 [640, 640] YoloX with ['object_detection_yolox_2022nov.onnx']
  1128.73 1157.12 1085.31 [640, 640] YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+ 67.31 67.41 66.23 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
  147.01 144.01 139.27 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
  119.70 118.95 94.09 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
  107.63 107.09 105.61 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -398,6 +404,7 @@ mean median min input size model
  322.98 323.45 312.13 [416, 416] NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
  1875.33 1877.53 1871.26 [640, 640] YoloX with ['object_detection_yolox_2022nov.onnx']
  1989.04 2005.25 1871.26 [640, 640] YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+ 143.62 143.19 137.16 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
  159.80 159.62 159.40 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
  152.18 152.86 145.56 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
  145.83 145.77 145.45 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -418,8 +425,10 @@ mean median min input size model
 
  NPU (CANN):
 
+ <!-- vittrack is excluded due to HardSwish is not supported by CANN backend yet -->
+
  ```
- $ python3 benchmark.py --all --fp32 --cfg_exclude wechat:crnn --model_exclude pose_estimation_mediapipe_2023mar.onnx --cfg_overwrite_backend_target 4
+ $ python3 benchmark.py --all --fp32 --cfg_exclude wechat:crnn:vittrack --model_exclude pose_estimation_mediapipe_2023mar.onnx --cfg_overwrite_backend_target 4
  Benchmarking ...
  backend=cv.dnn.DNN_BACKEND_CANN
  target=cv.dnn.DNN_TARGET_NPU
@@ -478,6 +487,7 @@ mean median min input size model
  1903.82 1962.71 1533.79 [416, 416] NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
  37604.10 37569.30 37502.48 [640, 640] YoloX with ['object_detection_yolox_2022nov.onnx']
  24229.20 25577.94 13483.54 [640, 640] YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+ 415.72 403.04 399.44 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
  1133.44 1131.54 1124.83 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
  883.96 919.07 655.33 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
  1430.98 1424.55 1415.68 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -533,6 +543,7 @@ mean median min input size model
  117.28 150.31 83.33 [416, 416] NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
  553.58 558.76 535.47 [640, 640] YoloX with ['object_detection_yolox_2022nov.onnx']
  594.18 592.64 535.47 [640, 640] YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+ 49.47 49.21 48.84 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
  56.35 55.73 55.25 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
  57.07 57.19 55.25 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
  47.94 48.41 47.05 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -587,6 +598,7 @@ mean median min input size model
  406.28 416.58 385.68 [416, 416] NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
  2608.90 2612.42 2597.93 [640, 640] YoloX with ['object_detection_yolox_2022nov.onnx']
  2609.88 2609.39 2597.93 [640, 640] YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+ 189.23 188.72 182.28 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
  228.95 228.74 228.35 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
  227.97 228.61 226.76 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
  192.29 192.26 191.74 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -643,6 +655,7 @@ mean median min input size model
  3002.36 3047.94 2655.38 [416, 416] NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
  50678.08 50651.82 50651.19 [640, 640] YoloX with ['object_detection_yolox_2022nov.onnx']
  36249.71 37771.22 24606.37 [640, 640] YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+ 707.79 706.32 699.40 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
  1502.15 1501.98 1500.99 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
  1300.15 1320.44 1137.60 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
  1993.05 1993.98 1991.86 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -669,9 +682,8 @@ Specs: [details_cn](https://doc.rvspace.org/VisionFive2/PB/VisionFive_2/specific
 
  CPU:
  <!-- config wechat is excluded due to it needs building with opencv_contrib -->
- <!-- config vittrack is excluded due to opencv cannot find ffmpeg and its components -->
  ```
- $ python3 benchmark.py --all --cfg_exclude wechat:vittrack
+ $ python3 benchmark.py --all --cfg_exclude wechat
  Benchmarking ...
  backend=cv.dnn.DNN_BACKEND_OPENCV
  target=cv.dnn.DNN_TARGET_CPU
@@ -698,6 +710,7 @@ mean median min input size model
  1434.56 1463.32 1194.57 [416, 416] NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
  26172.62 26160.04 26151.67 [640, 640] YoloX with ['object_detection_yolox_2022nov.onnx']
  17004.06 17909.88 9659.54 [640, 640] YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+ 304.58 309.56 280.05 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
  734.97 735.58 733.95 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
  609.61 621.69 508.04 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
  961.41 962.26 960.39 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -715,3 +728,43 @@ mean median min input size model
  1237.00 1395.68 807.66 [1280, 720] CRNN with ['text_recognition_CRNN_CN_2021nov_int8.onnx']
  1169.59 1415.29 774.09 [1280, 720] CRNN with ['text_recognition_CRNN_EN_2022oct_int8.onnx']
  ```
+
+ <!--
+
+ ### Khadas VIM4
+
+ CPU:
+
+ ```
+ 67.65 67.84 66.39 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
+ ```
+
+ ### NVIDIA Jetson Orin Nano
+
+ CPU:
+
+ ```
+ 59.30 58.45 57.90 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
+ ```
+
+ CUDA:
+
+ ```
+ 13.69 13.69 13.04 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
+ ```
+
+ CUDA-FP16:
+
+ ```
+ 16.29 15.77 15.77 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
+ ```
+
+ ### Atlas 200I DK
+
+ CPU:
+
+ ```
+ 88.24 87.00 84.23 [1280, 720] VitTrack with ['object_tracking_vittrack_2023sep.onnx']
+ ```
+
+ -->
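Note: the mean / median / min columns in the tables above are per-frame inference latencies in milliseconds, gathered from repeated timed runs of each model at the listed input size. The sketch below illustrates how such statistics can be produced with OpenCV's TickMeter; the warmup/repeat counts and loop structure are assumptions for illustration, not the exact logic of benchmark.py.

```python
# Illustrative sketch only: deriving mean/median/min latency columns from
# repeated timed runs. Warmup/repeat counts are assumptions, not the actual
# benchmark.py settings.
import numpy as np
import cv2 as cv

def measure_latency(run_once, warmup=10, repeat=100):
    tm = cv.TickMeter()
    for _ in range(warmup):
        run_once()                      # warm up lazy initialization and caches
    samples_ms = []
    for _ in range(repeat):
        tm.reset()
        tm.start()
        run_once()                      # one forward pass / tracker update
        tm.stop()
        samples_ms.append(tm.getTimeMilli())
    samples = np.asarray(samples_ms)
    return samples.mean(), np.median(samples), samples.min()
```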
benchmark/color_table.svg CHANGED
benchmark/config/object_tracking_vittrack.yaml ADDED
@@ -0,0 +1,14 @@
+ Benchmark:
+   name: "Object Tracking Benchmark"
+   type: "Tracking"
+   data:
+     type: "TrackingVideoLoader"
+     path: "data/object_tracking"
+     files: ["throw_cup.mp4"]
+   metric:
+     type: "Tracking"
+   backend: "default"
+   target: "cpu"
+
+ Model:
+   name: "VitTrack"
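This new config wires VitTrack into the benchmark harness: a `TrackingVideoLoader` feeds frames from `throw_cup.mp4`, a tracking metric scores the run, and the model is looked up by the name "VitTrack". A hypothetical standalone check of the fields (plain PyYAML, outside the harness) could look like this:

```python
# Hypothetical standalone check of the new config; the real benchmark harness
# parses and dispatches these fields itself.
import yaml

with open("benchmark/config/object_tracking_vittrack.yaml") as f:
    cfg = yaml.safe_load(f)

assert cfg["Benchmark"]["type"] == "Tracking"
assert cfg["Benchmark"]["data"]["files"] == ["throw_cup.mp4"]
assert cfg["Model"]["name"] == "VitTrack"   # must match a registered model class
```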
benchmark/table_config.yaml CHANGED
@@ -157,6 +157,13 @@ Models:
     acceptable_time: 700
     keyword: "pose_estimation_mediapipe"
 
+   - name: "VitTrack"
+     task: "Object Tracking"
+     input_size: "1280x720"
+     folder: "object_tracking_vittrack"
+     acceptable_time: 1000
+     keyword: "object_tracking_vittrack"
+
 
  Devices:
    - name: "Intel 12700K"
models/__init__.py CHANGED
@@ -19,6 +19,7 @@ from .license_plate_detection_yunet.lpd_yunet import LPD_YuNet
  from .object_detection_nanodet.nanodet import NanoDet
  from .object_detection_yolox.yolox import YoloX
  from .facial_expression_recognition.facial_fer_model import FacialExpressionRecog
+ from .object_tracking_vittrack.vittrack import VitTrack
 
  class ModuleRegistery:
      def __init__(self, name):
@@ -92,3 +93,4 @@ MODELS.register(LPD_YuNet)
  MODELS.register(NanoDet)
  MODELS.register(YoloX)
  MODELS.register(FacialExpressionRecog)
+ MODELS.register(VitTrack)
models/object_tracking_vittrack/README.md CHANGED
@@ -1,18 +1,19 @@
  # VIT tracker
 
- VIT tracker(vision transformer tracker) is a much better model for real-time object tracking. VIT tracker can achieve speeds exceeding nanotrack by 20% in single-threaded mode with ARM chip, and the advantage becomes even more pronounced in multi-threaded mode. In addition, on the dataset, vit tracker demonstrates better performance compared to nanotrack. Moreover, vit trackerprovides confidence values during the tracking process, which can be used to determine if the tracking is currently lost.
+ VIT tracker (Vision Transformer tracker) is a much better model for real-time object tracking than NanoTrack. It runs about 20% faster than NanoTrack in single-threaded mode on ARM chips, and the advantage becomes even more pronounced in multi-threaded mode. It also achieves better accuracy than NanoTrack on the evaluation dataset. Moreover, VIT tracker provides a confidence value during tracking, which can be used to determine whether the target is currently lost.
 
- video demo: https://youtu.be/MJiPnu1ZQRI
- In target tracking tasks, the score is an important indicator that can indicate whether the current target is lost. In the video, vit tracker can track the target and display the current score in the upper left corner of the video. When the target is lost, the score drops significantly. While nanotrack will only return 0.9 score in any situation, so that we cannot determine whether the target is lost.
+ In tracking tasks, the score indicates whether the current target is lost. In the demo video, VIT tracker tracks the target and displays the current score in the upper-left corner. When the target is lost, the score drops significantly, whereas NanoTrack always returns a score of 0.9, so target loss cannot be detected from it.
+
+ Video demo: https://youtu.be/MJiPnu1ZQRI
 
  This model is contributed by [Pengyu Liu](https://github.com/lpylpy0514) in GSoC 2023 project [**Realtime object tracking models**](https://github.com/opencv/opencv/wiki/GSoC_2023#idea-realtime-object-tracking-models)
 
- **NOTE: OpenCV > 4.8.0**
+ **NOTE: OpenCV > 4.8.0 is required. Build from source following the instructions at https://opencv.org/get-started/.**
 
  # Demo
 
  ```bash
- # tracking on video
+ # tracking on video
  python demo.py --input /path/to/video
 
  # get help regarding various parameters
@@ -59,4 +60,4 @@ All files in this directory are licensed under [Apache 2.0 License](./LICENSE).
 
  OSTrack: https://github.com/botaoye/OSTrack
 
- OpenCV Sample: https://github.com/opencv/opencv/blob/4.x/samples/dnn/vit_tracker.cpp
+ OpenCV Sample: https://github.com/opencv/opencv/blob/4.x/samples/dnn/vit_tracker.cpp
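For readers who want to use the tracker without the zoo's wrapper class, the underlying OpenCV API that this model targets (the same `cv.TrackerVit_*` calls used by `demo.py` and `vittrack.py` below) can be driven directly. This is a minimal sketch; the video path is a placeholder.

```python
# Minimal cv.TrackerVit sketch (requires OpenCV > 4.8.0); mirrors the calls
# used by this model's demo. The input path is a placeholder.
import cv2 as cv

params = cv.TrackerVit_Params()
params.net = "object_tracking_vittrack_2023sep.onnx"
tracker = cv.TrackerVit_create(params)

video = cv.VideoCapture("/path/to/video")
ok, frame = video.read()
roi = cv.selectROI("select target", frame)      # draw the initial bounding box
tracker.init(frame, roi)

while True:
    ok, frame = video.read()
    if not ok:
        break
    located, bbox = tracker.update(frame)
    score = tracker.getTrackingScore()           # confidence; drops when the target is lost
    if located:
        x, y, w, h = [int(v) for v in bbox]
        cv.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
    cv.imshow("VIT tracker", frame)
    if cv.waitKey(1) > 0:
        break
```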
models/object_tracking_vittrack/demo.py CHANGED
@@ -1,17 +1,45 @@
+ # This file is part of OpenCV Zoo project.
+ # It is subject to the license terms in the LICENSE file found in the same directory.
+
+ import argparse
+
  import numpy as np
  import cv2 as cv
- import argparse
+
+
+ from vittrack import VitTrack
 
  # Check OpenCV version
  assert cv.__version__ > "4.8.0", \
         "Please install latest opencv-python to try this demo: python3 -m pip install --upgrade opencv-python"
 
+ # Valid combinations of backends and targets
+ backend_target_pairs = [
+     [cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_CPU],
+     [cv.dnn.DNN_BACKEND_CUDA, cv.dnn.DNN_TARGET_CUDA],
+     [cv.dnn.DNN_BACKEND_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16],
+     [cv.dnn.DNN_BACKEND_TIMVX, cv.dnn.DNN_TARGET_NPU],
+     [cv.dnn.DNN_BACKEND_CANN, cv.dnn.DNN_TARGET_NPU]
+ ]
+
  parser = argparse.ArgumentParser(
      description="VIT track opencv API")
  parser.add_argument('--input', '-i', type=str,
                      help='Usage: Set path to the input video. Omit for using default camera.')
  parser.add_argument('--model_path', type=str, default='object_tracking_vittrack_2023sep.onnx',
                      help='Usage: Set model path')
+ parser.add_argument('--backend_target', '-bt', type=int, default=0,
+                     help='''Choose one of the backend-target pair to run this demo:
+                         {:d}: (default) OpenCV implementation + CPU,
+                         {:d}: CUDA + GPU (CUDA),
+                         {:d}: CUDA + GPU (CUDA FP16),
+                         {:d}: TIM-VX + NPU,
+                         {:d}: CANN + NPU
+                     '''.format(*[x for x in range(len(backend_target_pairs))]))
+ parser.add_argument('--save', '-s', action='store_true',
+                     help='Usage: Specify to save a file with results. Invalid in case of camera input.')
+ parser.add_argument('--vis', '-v', action='store_true',
+                     help='Usage: Specify to open a new window to show results. Invalid in case of camera input.')
  args = parser.parse_args()
 
  def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),text_color=(0, 255, 0), fontScale = 1, fontSize = 1):
@@ -35,16 +63,16 @@ def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),tex
      return output
 
  if __name__ == '__main__':
+     backend_id = backend_target_pairs[args.backend_target][0]
+     target_id = backend_target_pairs[args.backend_target][1]
 
-     params = cv.TrackerVit_Params()
-     params.net = args.model_path
-     model = cv.TrackerVit_create(params)
+     model = VitTrack(
+         model_path=args.model_path,
+         backend_id=backend_id,
+         target_id=target_id)
 
      # Read from args.input
-     _input = args.input
-     if args.input is None:
-         device_id = 0
-         _input = device_id
+     _input = 0 if args.input is None else args.input
      video = cv.VideoCapture(_input)
 
      # Select an object
@@ -75,11 +103,9 @@ if __name__ == '__main__':
              break
          # Inference
          tm.start()
-         isLocated, bbox = model.update(frame)
-         score = model.getTrackingScore()
+         isLocated, bbox, score = model.infer(frame)
          tm.stop()
          # Visualize
          frame = visualize(frame, bbox, score, isLocated, fps=tm.getFPS())
-         cv.imshow('vittrack Demo', frame)
+         cv.imshow('VitTrack Demo', frame)
          tm.reset()
-
models/object_tracking_vittrack/vittrack.py ADDED
@@ -0,0 +1,39 @@
+ # This file is part of OpenCV Zoo project.
+ # It is subject to the license terms in the LICENSE file found in the same directory.
+
+ import numpy as np
+ import cv2 as cv
+
+ class VitTrack:
+     def __init__(self, model_path, backend_id=0, target_id=0):
+         self.model_path = model_path
+         self.backend_id = backend_id
+         self.target_id = target_id
+
+         self.params = cv.TrackerVit_Params()
+         self.params.net = self.model_path
+         self.params.backend = self.backend_id
+         self.params.target = self.target_id
+
+         self.model = cv.TrackerVit_create(self.params)
+
+     @property
+     def name(self):
+         return self.__class__.__name__
+
+     def setBackendAndTarget(self, backend_id, target_id):
+         self.backend_id = backend_id
+         self.target_id = target_id
+
+         self.params.backend = self.backend_id
+         self.params.target = self.target_id
+
+         self.model = cv.TrackerVit_create(self.params)
+
+     def init(self, image, roi):
+         self.model.init(image, roi)
+
+     def infer(self, image):
+         is_located, bbox = self.model.update(image)
+         score = self.model.getTrackingScore()
+         return is_located, bbox, score
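A brief usage sketch of this wrapper, mirroring how the updated demo.py drives it; the video path, ROI selection, and backend choice are placeholders.

```python
# Usage sketch for the VitTrack wrapper above; input path and backend choice
# are placeholders, following the flow of demo.py.
import cv2 as cv
from vittrack import VitTrack

tracker = VitTrack(model_path="object_tracking_vittrack_2023sep.onnx",
                   backend_id=cv.dnn.DNN_BACKEND_OPENCV,
                   target_id=cv.dnn.DNN_TARGET_CPU)

video = cv.VideoCapture("/path/to/video")
ok, first_frame = video.read()
roi = cv.selectROI("select target", first_frame)    # initial bounding box
tracker.init(first_frame, roi)

while True:
    ok, frame = video.read()
    if not ok:
        break
    is_located, bbox, score = tracker.infer(frame)  # per-frame update + confidence
```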