benchmark: add vittrack benchmark results (#214)
* create vittrack.py for benchmark; modify demo.py to use VitTrack
* add benchmark config
* add benchmark results on some hardware
* update benchmark results on jetson, jetson orin, vim4
* update results on sunrise x3, 200i dk, rv1126
* update benchmark results on atlas 200 and maix
* improve doc; update results on cuda
* doc correction
- benchmark/README.md +56 -3
- benchmark/color_table.svg +187 -7
- benchmark/config/object_tracking_vittrack.yaml +14 -0
- benchmark/table_config.yaml +7 -0
- models/__init__.py +2 -0
- models/object_tracking_vittrack/README.md +7 -6
- models/object_tracking_vittrack/demo.py +38 -12
- models/object_tracking_vittrack/vittrack.py +39 -0
benchmark/README.md
CHANGED
@@ -94,6 +94,7 @@ mean median min input size model
 46.10     47.53     43.06     [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 144.89    149.58    125.71    [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 143.83    146.39    119.75    [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+12.52     14.47     11.63     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 12.99     13.11     12.14     [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 12.64     12.44     10.82     [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 12.64     11.83     11.03     [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -148,6 +149,7 @@ mean median min input size model
 212.90    212.93    209.55    [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 1690.06   2303.34   1480.63   [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 1489.54   1435.48   1308.12   [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+90.49     89.23     86.83     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 356.63    357.29    354.42    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 217.52    229.39    101.61    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 198.63    198.25    196.68    [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -203,6 +205,7 @@ mean median min input size model
 216.18    216.19    214.30    [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 1207.83   1208.71   1203.64   [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 1236.98   1250.21   1203.64   [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+123.30    125.37    116.69    [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 124.89    125.25    124.53    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 107.99    109.82    94.05     [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 108.41    108.33    107.91    [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -241,6 +244,7 @@ mean median min input size model
 54.24     55.24     52.87     [320, 240]   LPD_YuNet with ['license_plate_detection_lpd_yunet_2023mar.onnx']
 63.63     63.43     63.32     [416, 416]   NanoDet with ['object_detection_nanodet_2022nov.onnx']
 371.45    378.00    366.39    [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
+43.06     42.32     39.92     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 33.85     33.90     33.61     [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 38.16     37.33     37.10     [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
 91.65     91.98     89.90     [128, 256]   YoutuReID with ['person_reid_youtu_2021nov.onnx']
@@ -271,6 +275,7 @@ mean median min input size model
 366.46    366.88    363.46    [320, 240]   LPD_YuNet with ['license_plate_detection_lpd_yunet_2023mar.onnx']
 163.06    163.34    161.77    [416, 416]   NanoDet with ['object_detection_nanodet_2022nov.onnx']
 301.10    311.52    297.74    [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
+43.36     40.65     39.85     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 149.37    149.95    148.01    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 153.89    153.96    153.43    [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
 44.29     44.03     43.62     [128, 256]   YoutuReID with ['person_reid_youtu_2021nov.onnx']
@@ -318,6 +323,7 @@ mean median min input size model
 212.69    262.75    170.88    [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 1110.87   1112.27   1085.31   [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 1128.73   1157.12   1085.31   [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+67.31     67.41     66.23     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 147.01    144.01    139.27    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 119.70    118.95    94.09     [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 107.63    107.09    105.61    [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -398,6 +404,7 @@ mean median min input size model
 322.98    323.45    312.13    [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 1875.33   1877.53   1871.26   [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 1989.04   2005.25   1871.26   [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+143.62    143.19    137.16    [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 159.80    159.62    159.40    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 152.18    152.86    145.56    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 145.83    145.77    145.45    [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -418,8 +425,10 @@ mean median min input size model
 
 NPU (CANN):
 
+<!-- vittrack is excluded due to HardSwish is not supported by CANN backend yet -->
+
 ```
-$ python3 benchmark.py --all --fp32 --cfg_exclude wechat:crnn --model_exclude pose_estimation_mediapipe_2023mar.onnx --cfg_overwrite_backend_target 4
+$ python3 benchmark.py --all --fp32 --cfg_exclude wechat:crnn:vittrack --model_exclude pose_estimation_mediapipe_2023mar.onnx --cfg_overwrite_backend_target 4
 Benchmarking ...
 backend=cv.dnn.DNN_BACKEND_CANN
 target=cv.dnn.DNN_TARGET_NPU
@@ -478,6 +487,7 @@ mean median min input size model
 1903.82   1962.71   1533.79   [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 37604.10  37569.30  37502.48  [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 24229.20  25577.94  13483.54  [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+415.72    403.04    399.44    [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 1133.44   1131.54   1124.83   [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 883.96    919.07    655.33    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 1430.98   1424.55   1415.68   [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -533,6 +543,7 @@ mean median min input size model
 117.28    150.31    83.33     [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 553.58    558.76    535.47    [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 594.18    592.64    535.47    [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+49.47     49.21     48.84     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 56.35     55.73     55.25     [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 57.07     57.19     55.25     [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 47.94     48.41     47.05     [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -587,6 +598,7 @@ mean median min input size model
 406.28    416.58    385.68    [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 2608.90   2612.42   2597.93   [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 2609.88   2609.39   2597.93   [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+189.23    188.72    182.28    [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 228.95    228.74    228.35    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 227.97    228.61    226.76    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 192.29    192.26    191.74    [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -643,6 +655,7 @@ mean median min input size model
 3002.36   3047.94   2655.38   [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 50678.08  50651.82  50651.19  [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 36249.71  37771.22  24606.37  [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+707.79    706.32    699.40    [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 1502.15   1501.98   1500.99   [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 1300.15   1320.44   1137.60   [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 1993.05   1993.98   1991.86   [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -669,9 +682,8 @@ Specs: [details_cn](https://doc.rvspace.org/VisionFive2/PB/VisionFive_2/specific
 
 CPU:
 <!-- config wechat is excluded due to it needs building with opencv_contrib -->
-<!-- config vittrack is excluded due to opencv cannot find ffmpeg and its components -->
 ```
-$ python3 benchmark.py --all --cfg_exclude wechat
+$ python3 benchmark.py --all --cfg_exclude wechat
 Benchmarking ...
 backend=cv.dnn.DNN_BACKEND_OPENCV
 target=cv.dnn.DNN_TARGET_CPU
@@ -698,6 +710,7 @@ mean median min input size model
 1434.56   1463.32   1194.57   [416, 416]   NanoDet with ['object_detection_nanodet_2022nov_int8.onnx']
 26172.62  26160.04  26151.67  [640, 640]   YoloX with ['object_detection_yolox_2022nov.onnx']
 17004.06  17909.88  9659.54   [640, 640]   YoloX with ['object_detection_yolox_2022nov_int8.onnx']
+304.58    309.56    280.05    [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
 734.97    735.58    733.95    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
 609.61    621.69    508.04    [192, 192]   MPPalmDet with ['palm_detection_mediapipe_2023feb_int8.onnx']
 961.41    962.26    960.39    [224, 224]   MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
@@ -715,3 +728,43 @@ mean median min input size model
 1237.00   1395.68   807.66    [1280, 720]  CRNN with ['text_recognition_CRNN_CN_2021nov_int8.onnx']
 1169.59   1415.29   774.09    [1280, 720]  CRNN with ['text_recognition_CRNN_EN_2022oct_int8.onnx']
 ```
+
+<!--
+
+### Khadas VIM4
+
+CPU:
+
+```
+67.65     67.84     66.39     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
+```
+
+### NVIDIA Jetson Orin Nano
+
+CPU:
+
+```
+59.30     58.45     57.90     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
+```
+
+CUDA:
+
+```
+13.69     13.69     13.04     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
+```
+
+CUDA-FP16:
+
+```
+16.29     15.77     15.77     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
+```
+
+### Atlas 200I DK
+
+CPU:
+
+```
+88.24     87.00     84.23     [1280, 720]  VitTrack with ['object_tracking_vittrack_2023sep.onnx']
+```
+
+-->
benchmark/color_table.svg
CHANGED
benchmark/config/object_tracking_vittrack.yaml
ADDED
@@ -0,0 +1,14 @@
+Benchmark:
+  name: "Object Tracking Benchmark"
+  type: "Tracking"
+  data:
+    type: "TrackingVideoLoader"
+    path: "data/object_tracking"
+    files: ["throw_cup.mp4"]
+  metric:
+    type: "Tracking"
+  backend: "default"
+  target: "cpu"
+
+Model:
+  name: "VitTrack"
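With this config in place, the VitTrack rows above are picked up by `--all` runs. A per-config invocation would presumably look like the sketch below; the `--cfg` flag is an assumption based on how other configs in this repository are run and is not shown in this diff:

```
# assumed per-config invocation; benchmark/README.md only shows the --all form
$ python3 benchmark.py --cfg ./config/object_tracking_vittrack.yaml
```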
benchmark/table_config.yaml
CHANGED
@@ -157,6 +157,13 @@ Models:
   acceptable_time: 700
   keyword: "pose_estimation_mediapipe"
 
+- name: "VitTrack"
+  task: "Object Tracking"
+  input_size: "1280x720"
+  folder: "object_tracking_vittrack"
+  acceptable_time: 1000
+  keyword: "object_tracking_vittrack"
+
 
 Devices:
 - name: "Intel 12700K"
models/__init__.py
CHANGED
@@ -19,6 +19,7 @@ from .license_plate_detection_yunet.lpd_yunet import LPD_YuNet
 from .object_detection_nanodet.nanodet import NanoDet
 from .object_detection_yolox.yolox import YoloX
 from .facial_expression_recognition.facial_fer_model import FacialExpressionRecog
+from .object_tracking_vittrack.vittrack import VitTrack
 
 class ModuleRegistery:
     def __init__(self, name):
@@ -92,3 +93,4 @@ MODELS.register(LPD_YuNet)
 MODELS.register(NanoDet)
 MODELS.register(YoloX)
 MODELS.register(FacialExpressionRecog)
+MODELS.register(VitTrack)
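The two added lines make the tracker importable and register it so the benchmark can find it by name. The actual ModuleRegistery implementation is only partially visible in this diff; the sketch below is a generic illustration of the registration pattern, not the repository's code:

```python
class ModuleRegistry:
    """Generic name -> class registry (illustrative only, not the repo's ModuleRegistery)."""
    def __init__(self, name):
        self.name = name
        self._modules = {}

    def register(self, cls):
        # store the class under its own name, e.g. "VitTrack"
        self._modules[cls.__name__] = cls

    def get(self, name):
        return self._modules[name]


MODELS = ModuleRegistry("models")

class VitTrack:  # stand-in for models.object_tracking_vittrack.vittrack.VitTrack
    pass

MODELS.register(VitTrack)
assert MODELS.get("VitTrack") is VitTrack
```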
models/object_tracking_vittrack/README.md
CHANGED
@@ -1,18 +1,19 @@
 # VIT tracker
 
-VIT tracker(vision transformer tracker) is a much better model for
-video
-
+VIT tracker (vision transformer tracker) is a stronger model for real-time object tracking. On an ARM chip it runs about 20% faster than NanoTrack in single-threaded mode, and the advantage is even larger in multi-threaded mode. It also performs better than NanoTrack on the dataset, and it provides a confidence value during tracking that can be used to determine whether the target is currently lost.
+
+In object tracking, the score is an important indicator of whether the target has been lost. In the demo video, VIT tracker displays the current score in the upper-left corner; when the target is lost, the score drops significantly. NanoTrack, by contrast, returns a score of 0.9 in any situation, so it cannot be used to tell whether the target is lost.
+
+Video demo: https://youtu.be/MJiPnu1ZQRI
 
 This model is contributed by [Pengyu Liu](https://github.com/lpylpy0514) in GSoC 2023 project [**Realtime object tracking models**](https://github.com/opencv/opencv/wiki/GSoC_2023#idea-realtime-object-tracking-models)
 
-**NOTE: OpenCV > 4.8.0
+**NOTE: OpenCV > 4.8.0 is required. Build from source with instructions from https://opencv.org/get-started/.**
 
 # Demo
 
 ```bash
-#
+# tracking on video
 python demo.py --input /path/to/video
 
 # get help regarding various parameters
@@ -59,4 +60,4 @@ All files in this directory are licensed under [Apache 2.0 License](./LICENSE).
 
 OSTrack: https://github.com/botaoye/OSTrack
 
-OpenCV Sample: https://github.com/opencv/opencv/blob/4.x/samples/dnn/vit_tracker.cpp
+OpenCV Sample: https://github.com/opencv/opencv/blob/4.x/samples/dnn/vit_tracker.cpp
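Since the rewritten demo.py below adds a `--backend_target` flag, the demo can also be pointed at other backends; for example, index 1 selects the CUDA pair in `backend_target_pairs`:

```bash
# run the demo on GPU via the CUDA backend (index 1 in backend_target_pairs)
python demo.py --input /path/to/video --backend_target 1
```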
models/object_tracking_vittrack/demo.py
CHANGED
@@ -1,17 +1,45 @@
+# This file is part of OpenCV Zoo project.
+# It is subject to the license terms in the LICENSE file found in the same directory.
+
+import argparse
+
 import numpy as np
 import cv2 as cv
-
+
+
+from vittrack import VitTrack
 
 # Check OpenCV version
 assert cv.__version__ > "4.8.0", \
        "Please install latest opencv-python to try this demo: python3 -m pip install --upgrade opencv-python"
 
+# Valid combinations of backends and targets
+backend_target_pairs = [
+    [cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_CPU],
+    [cv.dnn.DNN_BACKEND_CUDA, cv.dnn.DNN_TARGET_CUDA],
+    [cv.dnn.DNN_BACKEND_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16],
+    [cv.dnn.DNN_BACKEND_TIMVX, cv.dnn.DNN_TARGET_NPU],
+    [cv.dnn.DNN_BACKEND_CANN, cv.dnn.DNN_TARGET_NPU]
+]
+
 parser = argparse.ArgumentParser(
     description="VIT track opencv API")
 parser.add_argument('--input', '-i', type=str,
                     help='Usage: Set path to the input video. Omit for using default camera.')
 parser.add_argument('--model_path', type=str, default='object_tracking_vittrack_2023sep.onnx',
                     help='Usage: Set model path')
+parser.add_argument('--backend_target', '-bt', type=int, default=0,
+                    help='''Choose one of the backend-target pair to run this demo:
+                        {:d}: (default) OpenCV implementation + CPU,
+                        {:d}: CUDA + GPU (CUDA),
+                        {:d}: CUDA + GPU (CUDA FP16),
+                        {:d}: TIM-VX + NPU,
+                        {:d}: CANN + NPU
+                    '''.format(*[x for x in range(len(backend_target_pairs))]))
+parser.add_argument('--save', '-s', action='store_true',
+                    help='Usage: Specify to save a file with results. Invalid in case of camera input.')
+parser.add_argument('--vis', '-v', action='store_true',
+                    help='Usage: Specify to open a new window to show results. Invalid in case of camera input.')
 args = parser.parse_args()
 
 def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),text_color=(0, 255, 0), fontScale = 1, fontSize = 1):
@@ -35,16 +63,16 @@ def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),tex
     return output
 
 if __name__ == '__main__':
+    backend_id = backend_target_pairs[args.backend_target][0]
+    target_id = backend_target_pairs[args.backend_target][1]
 
-
-
-
+    model = VitTrack(
+        model_path=args.model_path,
+        backend_id=backend_id,
+        target_id=target_id)
 
     # Read from args.input
-    _input = args.input
-    if args.input is None:
-        device_id = 0
-        _input = device_id
+    _input = 0 if args.input is None else args.input
     video = cv.VideoCapture(_input)
 
     # Select an object
@@ -75,11 +103,9 @@ if __name__ == '__main__':
            break
        # Inference
        tm.start()
-       isLocated, bbox = model.
-       score = model.getTrackingScore()
+       isLocated, bbox, score = model.infer(frame)
        tm.stop()
        # Visualize
        frame = visualize(frame, bbox, score, isLocated, fps=tm.getFPS())
-       cv.imshow('
+       cv.imshow('VitTrack Demo', frame)
        tm.reset()
-
models/object_tracking_vittrack/vittrack.py
ADDED
@@ -0,0 +1,39 @@
+# This file is part of OpenCV Zoo project.
+# It is subject to the license terms in the LICENSE file found in the same directory.
+
+import numpy as np
+import cv2 as cv
+
+class VitTrack:
+    def __init__(self, model_path, backend_id=0, target_id=0):
+        self.model_path = model_path
+        self.backend_id = backend_id
+        self.target_id = target_id
+
+        self.params = cv.TrackerVit_Params()
+        self.params.net = self.model_path
+        self.params.backend = self.backend_id
+        self.params.target = self.target_id
+
+        self.model = cv.TrackerVit_create(self.params)
+
+    @property
+    def name(self):
+        return self.__class__.__name__
+
+    def setBackendAndTarget(self, backend_id, target_id):
+        self.backend_id = backend_id
+        self.target_id = target_id
+
+        self.params.backend = self.backend_id
+        self.params.target = self.target_id
+
+        self.model = cv.TrackerVit_create(self.params)
+
+    def init(self, image, roi):
+        self.model.init(image, roi)
+
+    def infer(self, image):
+        is_located, bbox = self.model.update(image)
+        score = self.model.getTrackingScore()
+        return is_located, bbox, score
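The wrapper mirrors how demo.py uses it: construct once, `init` with a region of interest, then call `infer` per frame. A minimal usage sketch follows; the input video and the interactive ROI selection are illustrative, not part of this PR:

```python
import cv2 as cv

from vittrack import VitTrack

tracker = VitTrack(model_path='object_tracking_vittrack_2023sep.onnx')

video = cv.VideoCapture('throw_cup.mp4')    # any video; this file name comes from the benchmark config
ok, frame = video.read()
roi = cv.selectROI('Select object', frame)  # draw the initial bounding box, then press Enter
tracker.init(frame, roi)

while True:
    ok, frame = video.read()
    if not ok:
        break
    is_located, bbox, score = tracker.infer(frame)
    if is_located:
        x, y, w, h = bbox
        cv.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
        cv.putText(frame, '{:.2f}'.format(score), (x, y - 5),
                   cv.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0))
    cv.imshow('VitTrack', frame)
    if cv.waitKey(1) > 0:
        break
```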