Wanli committed on
Commit
43ea797
·
1 Parent(s): ac5c83c

remove text detection DB (#221)

Browse files
benchmark/README.md CHANGED
@@ -102,8 +102,6 @@ mean median min input size model
102
  26.37 33.51 21.48 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
103
  10.07 9.68 8.16 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
104
  1.19 1.30 1.07 [100, 100] WeChatQRCode with ['detect_2021nov.prototxt', 'detect_2021nov.caffemodel', 'sr_2021nov.prototxt', 'sr_2021nov.caffemodel']
105
- 80.97 80.06 73.20 [640, 480] DB with ['text_detection_DB_IC15_resnet18_2021sep.onnx']
106
- 80.73 85.47 72.06 [640, 480] DB with ['text_detection_DB_TD500_resnet18_2021sep.onnx']
107
  23.86 24.16 23.26 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
108
  23.94 23.76 23.26 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
109
  26.89 24.78 23.26 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
@@ -161,8 +159,6 @@ mean median min input size model
161
  381.72 394.15 308.62 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
162
  194.47 195.18 191.67 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
163
  5.90 5.90 5.81 [100, 100] WeChatQRCode with ['detect_2021nov.prototxt', 'detect_2021nov.caffemodel', 'sr_2021nov.prototxt', 'sr_2021nov.caffemodel']
164
- 2033.55 2454.13 1769.20 [640, 480] DB with ['text_detection_DB_IC15_resnet18_2021sep.onnx']
165
- 1896.61 1977.38 1769.20 [640, 480] DB with ['text_detection_DB_TD500_resnet18_2021sep.onnx']
166
  462.50 463.67 456.98 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
167
  462.97 464.33 456.98 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
168
  470.79 464.35 456.98 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
@@ -221,8 +217,6 @@ mean median min input size model
221
  343.35 344.56 333.41 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
222
  89.93 91.58 88.28 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
223
  5.69 5.72 5.66 [100, 100] WeChatQRCode with ['detect_2021nov.prototxt', 'detect_2021nov.caffemodel', 'sr_2021nov.prototxt', 'sr_2021nov.caffemodel']
224
- 1070.55 1072.14 1055.67 [640, 480] DB with ['text_detection_DB_IC15_resnet18_2021sep.onnx']
225
- 1071.56 1071.38 1055.67 [640, 480] DB with ['text_detection_DB_TD500_resnet18_2021sep.onnx']
226
  238.89 238.22 236.97 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
227
  238.41 240.39 236.97 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
228
  276.96 240.19 236.97 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
@@ -261,8 +255,6 @@ mean median min input size model
261
  38.16 37.33 37.10 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
262
  91.65 91.98 89.90 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
263
  91.40 92.74 89.76 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
264
- 223.24 224.30 216.37 [640, 480] DB with ['text_detection_DB_IC15_resnet18_2021sep.onnx']
265
- 223.03 222.28 216.37 [640, 480] DB with ['text_detection_DB_TD500_resnet18_2021sep.onnx']
266
  112.35 111.90 109.99 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
267
  112.68 114.63 109.93 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
268
  183.96 112.72 109.93 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
@@ -296,8 +288,6 @@ mean median min input size model
296
  153.89 153.96 153.43 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
297
  44.29 44.03 43.62 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
298
  91.28 92.89 89.79 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
299
- 254.78 256.13 245.60 [640, 480] DB with ['text_detection_DB_IC15_resnet18_2021sep.onnx']
300
- 254.98 255.20 245.60 [640, 480] DB with ['text_detection_DB_TD500_resnet18_2021sep.onnx']
301
  427.53 428.67 425.63 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
302
  427.79 429.28 425.63 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
303
  414.07 429.46 387.26 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
@@ -350,8 +340,6 @@ mean median min input size model
350
  333.03 346.65 322.37 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
351
  322.95 315.22 303.07 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
352
  127.16 173.93 99.77 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
353
- 975.49 977.45 952.43 [640, 480] DB with ['text_detection_DB_IC15_resnet18_2021sep.onnx']
354
- 970.16 970.83 928.66 [640, 480] DB with ['text_detection_DB_TD500_resnet18_2021sep.onnx']
355
  238.38 241.90 233.21 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
356
  238.05 236.53 232.05 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
357
  262.58 238.47 232.05 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
@@ -437,8 +425,6 @@ mean median min input size model
437
  521.46 521.66 520.28 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
438
  541.50 544.02 520.28 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
439
  134.02 136.01 132.06 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
440
- 1441.73 1442.80 1440.26 [640, 480] DB with ['text_detection_DB_IC15_resnet18_2021sep.onnx']
441
- 1436.45 1437.89 1430.58 [640, 480] DB with ['text_detection_DB_TD500_resnet18_2021sep.onnx']
442
  360.26 360.82 359.13 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
443
  361.22 361.51 359.13 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
444
  427.85 362.87 359.13 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
@@ -477,8 +463,6 @@ mean median min input size model
477
  5.17 5.26 5.09 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
478
  16.45 16.44 16.31 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
479
  5.58 5.57 5.54 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
480
- 17.15 17.18 16.83 [640, 480] DB with ['text_detection_DB_IC15_resnet18_2021sep.onnx']
481
- 17.95 18.61 16.83 [640, 480] DB with ['text_detection_DB_TD500_resnet18_2021sep.onnx']
482
  ```
483
 
484
  ### Toybrick RV1126
@@ -524,8 +508,6 @@ mean median min input size model
524
  11131.81 11141.37 11080.20 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
525
  7065.00 7461.37 3748.85 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
526
  790.98 823.19 755.99 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
527
- 49331.32 49285.30 49210.67 [640, 480] DB with ['text_detection_DB_IC15_resnet18_2021sep.onnx']
528
- 49327.34 49489.22 49210.67 [640, 480] DB with ['text_detection_DB_TD500_resnet18_2021sep.onnx']
529
  4422.65 4432.92 4376.19 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
530
  4407.88 4405.92 4353.22 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
531
  3782.89 4404.01 2682.63 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
@@ -584,8 +566,6 @@ mean median min input size model
584
  146.02 145.89 139.08 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
585
  157.60 158.88 139.08 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
586
  41.26 42.74 40.08 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
587
- 384.47 401.25 360.71 [640, 480] DB with ['text_detection_DB_IC15_resnet18_2021sep.onnx']
588
- 377.91 381.15 336.30 [640, 480] DB with ['text_detection_DB_TD500_resnet18_2021sep.onnx']
589
  110.51 111.04 107.73 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
590
  110.67 111.54 107.73 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
591
  131.52 111.76 107.73 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
@@ -644,8 +624,6 @@ mean median min input size model
644
  646.25 647.89 631.03 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
645
  182.57 185.52 179.71 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
646
  9.93 9.97 9.82 [100, 100] WeChatQRCode with ['detect_2021nov.prototxt', 'detect_2021nov.caffemodel', 'sr_2021nov.prototxt', 'sr_2021nov.caffemodel']
647
- 1914.15 1913.70 1902.25 [640, 480] DB with ['text_detection_DB_IC15_resnet18_2021sep.onnx']
648
- 1920.07 1929.80 1902.25 [640, 480] DB with ['text_detection_DB_TD500_resnet18_2021sep.onnx']
649
  495.04 493.75 489.41 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
650
  493.63 491.89 489.41 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
651
  598.94 496.42 489.41 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
@@ -704,8 +682,6 @@ mean median min input size model
704
  14925.56 14926.90 14912.28 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
705
  10507.96 10944.15 6974.74 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
706
  1113.51 1124.83 1106.81 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
707
- 66015.47 65997.60 65993.81 [640, 480] DB with ['text_detection_DB_IC15_resnet18_2021sep.onnx']
708
- 66023.14 66034.99 65993.81 [640, 480] DB with ['text_detection_DB_TD500_resnet18_2021sep.onnx']
709
  6094.40 6093.77 6091.85 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
710
  6073.33 6076.77 6055.13 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
711
  5547.32 6057.15 4653.05 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
@@ -763,8 +739,6 @@ mean median min input size model
763
  7594.21 7590.75 7589.16 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
764
  4884.04 5154.38 2715.94 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
765
  548.41 550.86 546.09 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
766
- 34074.19 34077.97 34058.43 [640, 480] DB with ['text_detection_DB_IC15_resnet18_2021sep.onnx']
767
- 34073.67 34069.82 34054.29 [640, 480] DB with ['text_detection_DB_TD500_resnet18_2021sep.onnx']
768
  3031.81 3031.79 3030.41 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
769
  3031.41 3031.17 3029.99 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
770
  2638.47 3031.01 1969.10 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
 
102
  26.37 33.51 21.48 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
103
  10.07 9.68 8.16 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
104
  1.19 1.30 1.07 [100, 100] WeChatQRCode with ['detect_2021nov.prototxt', 'detect_2021nov.caffemodel', 'sr_2021nov.prototxt', 'sr_2021nov.caffemodel']
 
 
105
  23.86 24.16 23.26 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
106
  23.94 23.76 23.26 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
107
  26.89 24.78 23.26 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
 
159
  381.72 394.15 308.62 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
160
  194.47 195.18 191.67 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
161
  5.90 5.90 5.81 [100, 100] WeChatQRCode with ['detect_2021nov.prototxt', 'detect_2021nov.caffemodel', 'sr_2021nov.prototxt', 'sr_2021nov.caffemodel']
 
 
162
  462.50 463.67 456.98 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
163
  462.97 464.33 456.98 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
164
  470.79 464.35 456.98 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
 
217
  343.35 344.56 333.41 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
218
  89.93 91.58 88.28 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
219
  5.69 5.72 5.66 [100, 100] WeChatQRCode with ['detect_2021nov.prototxt', 'detect_2021nov.caffemodel', 'sr_2021nov.prototxt', 'sr_2021nov.caffemodel']
 
 
220
  238.89 238.22 236.97 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
221
  238.41 240.39 236.97 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
222
  276.96 240.19 236.97 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
 
255
  38.16 37.33 37.10 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
256
  91.65 91.98 89.90 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
257
  91.40 92.74 89.76 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
 
 
258
  112.35 111.90 109.99 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
259
  112.68 114.63 109.93 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
260
  183.96 112.72 109.93 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
 
288
  153.89 153.96 153.43 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
289
  44.29 44.03 43.62 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
290
  91.28 92.89 89.79 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
 
 
291
  427.53 428.67 425.63 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
292
  427.79 429.28 425.63 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
293
  414.07 429.46 387.26 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
 
340
  333.03 346.65 322.37 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
341
  322.95 315.22 303.07 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
342
  127.16 173.93 99.77 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
 
 
343
  238.38 241.90 233.21 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
344
  238.05 236.53 232.05 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
345
  262.58 238.47 232.05 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
 
425
  521.46 521.66 520.28 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
426
  541.50 544.02 520.28 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
427
  134.02 136.01 132.06 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
 
 
428
  360.26 360.82 359.13 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
429
  361.22 361.51 359.13 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
430
  427.85 362.87 359.13 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
 
463
  5.17 5.26 5.09 [192, 192] MPPalmDet with ['palm_detection_mediapipe_2023feb.onnx']
464
  16.45 16.44 16.31 [224, 224] MPPersonDet with ['person_detection_mediapipe_2023mar.onnx']
465
  5.58 5.57 5.54 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
 
 
466
  ```
467
 
468
  ### Toybrick RV1126
 
508
  11131.81 11141.37 11080.20 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
509
  7065.00 7461.37 3748.85 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
510
  790.98 823.19 755.99 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
 
 
511
  4422.65 4432.92 4376.19 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
512
  4407.88 4405.92 4353.22 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
513
  3782.89 4404.01 2682.63 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
 
566
  146.02 145.89 139.08 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
567
  157.60 158.88 139.08 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
568
  41.26 42.74 40.08 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
 
 
569
  110.51 111.04 107.73 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
570
  110.67 111.54 107.73 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
571
  131.52 111.76 107.73 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
 
624
  646.25 647.89 631.03 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
625
  182.57 185.52 179.71 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
626
  9.93 9.97 9.82 [100, 100] WeChatQRCode with ['detect_2021nov.prototxt', 'detect_2021nov.caffemodel', 'sr_2021nov.prototxt', 'sr_2021nov.caffemodel']
 
 
627
  495.04 493.75 489.41 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
628
  493.63 491.89 489.41 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
629
  598.94 496.42 489.41 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
 
682
  14925.56 14926.90 14912.28 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
683
  10507.96 10944.15 6974.74 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
684
  1113.51 1124.83 1106.81 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
 
 
685
  6094.40 6093.77 6091.85 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
686
  6073.33 6076.77 6055.13 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
687
  5547.32 6057.15 4653.05 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
 
739
  7594.21 7590.75 7589.16 [128, 256] YoutuReID with ['person_reid_youtu_2021nov.onnx']
740
  4884.04 5154.38 2715.94 [128, 256] YoutuReID with ['person_reid_youtu_2021nov_int8.onnx']
741
  548.41 550.86 546.09 [256, 256] MPPose with ['pose_estimation_mediapipe_2023mar.onnx']
 
 
742
  3031.81 3031.79 3030.41 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may.onnx']
743
  3031.41 3031.17 3029.99 [640, 480] PPOCRDet with ['text_detection_en_ppocrv3_2023may.onnx']
744
  2638.47 3031.01 1969.10 [640, 480] PPOCRDet with ['text_detection_cn_ppocrv3_2023may_int8.onnx']
benchmark/color_table.svg CHANGED
benchmark/config/text_detection_db.yaml DELETED
@@ -1,20 +0,0 @@
1
- Benchmark:
2
- name: "Text Detection Benchmark"
3
- type: "Detection"
4
- data:
5
- path: "data/text"
6
- files: ["1.jpg", "2.jpg", "3.jpg"]
7
- sizes: # [[w1, h1], ...], Omit to run at original scale
8
- - [640, 480]
9
- metric:
10
- warmup: 30
11
- repeat: 10
12
- backend: "default"
13
- target: "cpu"
14
-
15
- Model:
16
- name: "DB"
17
- binaryThreshold: 0.3
18
- polygonThreshold: 0.5
19
- maxCandidates: 200
20
- unclipRatio: 2.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmark/table_config.yaml CHANGED
@@ -59,20 +59,6 @@ Models:
59
  acceptable_time: 2000
60
  keyword: "object_detection_nanodet"
61
 
62
- - name: "DB-IC15 (EN)"
63
- task: "Text Detection"
64
- input_size: "640x480"
65
- folder: "text_detection_db"
66
- acceptable_time: 2000
67
- keyword: "text_detection_DB_IC15_resnet18"
68
-
69
- - name: "DB-TD500 (EN&CN)"
70
- task: "Text Detection"
71
- input_size: "640x480"
72
- folder: "text_detection_db"
73
- acceptable_time: 2000
74
- keyword: "text_detection_DB_TD500_resnet18"
75
-
76
  - name: "PPOCRDet-CN"
77
  task: "Text Detection"
78
  input_size: "640x480"
 
59
  acceptable_time: 2000
60
  keyword: "object_detection_nanodet"
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  - name: "PPOCRDet-CN"
63
  task: "Text Detection"
64
  input_size: "640x480"
models/__init__.py CHANGED
@@ -3,7 +3,6 @@ import glob
3
  import os
4
 
5
  from .face_detection_yunet.yunet import YuNet
6
- from .text_detection_db.db import DB
7
  from .text_recognition_crnn.crnn import CRNN
8
  from .face_recognition_sface.sface import SFace
9
  from .image_classification_ppresnet.ppresnet import PPResNet
@@ -78,7 +77,6 @@ class ModuleRegistery:
78
 
79
  MODELS = ModuleRegistery('Models')
80
  MODELS.register(YuNet)
81
- MODELS.register(DB)
82
  MODELS.register(CRNN)
83
  MODELS.register(SFace)
84
  MODELS.register(PPResNet)
 
3
  import os
4
 
5
  from .face_detection_yunet.yunet import YuNet
 
6
  from .text_recognition_crnn.crnn import CRNN
7
  from .face_recognition_sface.sface import SFace
8
  from .image_classification_ppresnet.ppresnet import PPResNet
 
77
 
78
  MODELS = ModuleRegistery('Models')
79
  MODELS.register(YuNet)
 
80
  MODELS.register(CRNN)
81
  MODELS.register(SFace)
82
  MODELS.register(PPResNet)
models/text_detection_db/CMakeLists.txt DELETED
@@ -1,29 +0,0 @@
1
- cmake_minimum_required(VERSION 3.24)
2
- set(project_name "opencv_zoo_text_detection_db")
3
-
4
- PROJECT (${project_name})
5
-
6
- set(OPENCV_VERSION "4.8.0")
7
- set(OPENCV_INSTALLATION_PATH "" CACHE PATH "Where to look for OpenCV installation")
8
- find_package(OpenCV ${OPENCV_VERSION} REQUIRED HINTS ${OPENCV_INSTALLATION_PATH})
9
- # Find OpenCV, you may need to set OpenCV_DIR variable
10
- # to the absolute path to the directory containing OpenCVConfig.cmake file
11
- # via the command line or GUI
12
-
13
- file(GLOB SourceFile
14
- "demo.cpp")
15
- # If the package has been found, several variables will
16
- # be set, you can find the full list with descriptions
17
- # in the OpenCVConfig.cmake file.
18
- # Print some message showing some of them
19
- message(STATUS "OpenCV library status:")
20
- message(STATUS " config: ${OpenCV_DIR}")
21
- message(STATUS " version: ${OpenCV_VERSION}")
22
- message(STATUS " libraries: ${OpenCV_LIBS}")
23
- message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}")
24
-
25
- # Declare the executable target built from your sources
26
- add_executable(${project_name} ${SourceFile})
27
-
28
- # Link your application with OpenCV libraries
29
- target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/text_detection_db/LICENSE DELETED
@@ -1,202 +0,0 @@
1
-
2
- Apache License
3
- Version 2.0, January 2004
4
- http://www.apache.org/licenses/
5
-
6
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
-
8
- 1. Definitions.
9
-
10
- "License" shall mean the terms and conditions for use, reproduction,
11
- and distribution as defined by Sections 1 through 9 of this document.
12
-
13
- "Licensor" shall mean the copyright owner or entity authorized by
14
- the copyright owner that is granting the License.
15
-
16
- "Legal Entity" shall mean the union of the acting entity and all
17
- other entities that control, are controlled by, or are under common
18
- control with that entity. For the purposes of this definition,
19
- "control" means (i) the power, direct or indirect, to cause the
20
- direction or management of such entity, whether by contract or
21
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
- outstanding shares, or (iii) beneficial ownership of such entity.
23
-
24
- "You" (or "Your") shall mean an individual or Legal Entity
25
- exercising permissions granted by this License.
26
-
27
- "Source" form shall mean the preferred form for making modifications,
28
- including but not limited to software source code, documentation
29
- source, and configuration files.
30
-
31
- "Object" form shall mean any form resulting from mechanical
32
- transformation or translation of a Source form, including but
33
- not limited to compiled object code, generated documentation,
34
- and conversions to other media types.
35
-
36
- "Work" shall mean the work of authorship, whether in Source or
37
- Object form, made available under the License, as indicated by a
38
- copyright notice that is included in or attached to the work
39
- (an example is provided in the Appendix below).
40
-
41
- "Derivative Works" shall mean any work, whether in Source or Object
42
- form, that is based on (or derived from) the Work and for which the
43
- editorial revisions, annotations, elaborations, or other modifications
44
- represent, as a whole, an original work of authorship. For the purposes
45
- of this License, Derivative Works shall not include works that remain
46
- separable from, or merely link (or bind by name) to the interfaces of,
47
- the Work and Derivative Works thereof.
48
-
49
- "Contribution" shall mean any work of authorship, including
50
- the original version of the Work and any modifications or additions
51
- to that Work or Derivative Works thereof, that is intentionally
52
- submitted to Licensor for inclusion in the Work by the copyright owner
53
- or by an individual or Legal Entity authorized to submit on behalf of
54
- the copyright owner. For the purposes of this definition, "submitted"
55
- means any form of electronic, verbal, or written communication sent
56
- to the Licensor or its representatives, including but not limited to
57
- communication on electronic mailing lists, source code control systems,
58
- and issue tracking systems that are managed by, or on behalf of, the
59
- Licensor for the purpose of discussing and improving the Work, but
60
- excluding communication that is conspicuously marked or otherwise
61
- designated in writing by the copyright owner as "Not a Contribution."
62
-
63
- "Contributor" shall mean Licensor and any individual or Legal Entity
64
- on behalf of whom a Contribution has been received by Licensor and
65
- subsequently incorporated within the Work.
66
-
67
- 2. Grant of Copyright License. Subject to the terms and conditions of
68
- this License, each Contributor hereby grants to You a perpetual,
69
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
- copyright license to reproduce, prepare Derivative Works of,
71
- publicly display, publicly perform, sublicense, and distribute the
72
- Work and such Derivative Works in Source or Object form.
73
-
74
- 3. Grant of Patent License. Subject to the terms and conditions of
75
- this License, each Contributor hereby grants to You a perpetual,
76
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
- (except as stated in this section) patent license to make, have made,
78
- use, offer to sell, sell, import, and otherwise transfer the Work,
79
- where such license applies only to those patent claims licensable
80
- by such Contributor that are necessarily infringed by their
81
- Contribution(s) alone or by combination of their Contribution(s)
82
- with the Work to which such Contribution(s) was submitted. If You
83
- institute patent litigation against any entity (including a
84
- cross-claim or counterclaim in a lawsuit) alleging that the Work
85
- or a Contribution incorporated within the Work constitutes direct
86
- or contributory patent infringement, then any patent licenses
87
- granted to You under this License for that Work shall terminate
88
- as of the date such litigation is filed.
89
-
90
- 4. Redistribution. You may reproduce and distribute copies of the
91
- Work or Derivative Works thereof in any medium, with or without
92
- modifications, and in Source or Object form, provided that You
93
- meet the following conditions:
94
-
95
- (a) You must give any other recipients of the Work or
96
- Derivative Works a copy of this License; and
97
-
98
- (b) You must cause any modified files to carry prominent notices
99
- stating that You changed the files; and
100
-
101
- (c) You must retain, in the Source form of any Derivative Works
102
- that You distribute, all copyright, patent, trademark, and
103
- attribution notices from the Source form of the Work,
104
- excluding those notices that do not pertain to any part of
105
- the Derivative Works; and
106
-
107
- (d) If the Work includes a "NOTICE" text file as part of its
108
- distribution, then any Derivative Works that You distribute must
109
- include a readable copy of the attribution notices contained
110
- within such NOTICE file, excluding those notices that do not
111
- pertain to any part of the Derivative Works, in at least one
112
- of the following places: within a NOTICE text file distributed
113
- as part of the Derivative Works; within the Source form or
114
- documentation, if provided along with the Derivative Works; or,
115
- within a display generated by the Derivative Works, if and
116
- wherever such third-party notices normally appear. The contents
117
- of the NOTICE file are for informational purposes only and
118
- do not modify the License. You may add Your own attribution
119
- notices within Derivative Works that You distribute, alongside
120
- or as an addendum to the NOTICE text from the Work, provided
121
- that such additional attribution notices cannot be construed
122
- as modifying the License.
123
-
124
- You may add Your own copyright statement to Your modifications and
125
- may provide additional or different license terms and conditions
126
- for use, reproduction, or distribution of Your modifications, or
127
- for any such Derivative Works as a whole, provided Your use,
128
- reproduction, and distribution of the Work otherwise complies with
129
- the conditions stated in this License.
130
-
131
- 5. Submission of Contributions. Unless You explicitly state otherwise,
132
- any Contribution intentionally submitted for inclusion in the Work
133
- by You to the Licensor shall be under the terms and conditions of
134
- this License, without any additional terms or conditions.
135
- Notwithstanding the above, nothing herein shall supersede or modify
136
- the terms of any separate license agreement you may have executed
137
- with Licensor regarding such Contributions.
138
-
139
- 6. Trademarks. This License does not grant permission to use the trade
140
- names, trademarks, service marks, or product names of the Licensor,
141
- except as required for reasonable and customary use in describing the
142
- origin of the Work and reproducing the content of the NOTICE file.
143
-
144
- 7. Disclaimer of Warranty. Unless required by applicable law or
145
- agreed to in writing, Licensor provides the Work (and each
146
- Contributor provides its Contributions) on an "AS IS" BASIS,
147
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
- implied, including, without limitation, any warranties or conditions
149
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
- PARTICULAR PURPOSE. You are solely responsible for determining the
151
- appropriateness of using or redistributing the Work and assume any
152
- risks associated with Your exercise of permissions under this License.
153
-
154
- 8. Limitation of Liability. In no event and under no legal theory,
155
- whether in tort (including negligence), contract, or otherwise,
156
- unless required by applicable law (such as deliberate and grossly
157
- negligent acts) or agreed to in writing, shall any Contributor be
158
- liable to You for damages, including any direct, indirect, special,
159
- incidental, or consequential damages of any character arising as a
160
- result of this License or out of the use or inability to use the
161
- Work (including but not limited to damages for loss of goodwill,
162
- work stoppage, computer failure or malfunction, or any and all
163
- other commercial damages or losses), even if such Contributor
164
- has been advised of the possibility of such damages.
165
-
166
- 9. Accepting Warranty or Additional Liability. While redistributing
167
- the Work or Derivative Works thereof, You may choose to offer,
168
- and charge a fee for, acceptance of support, warranty, indemnity,
169
- or other liability obligations and/or rights consistent with this
170
- License. However, in accepting such obligations, You may act only
171
- on Your own behalf and on Your sole responsibility, not on behalf
172
- of any other Contributor, and only if You agree to indemnify,
173
- defend, and hold each Contributor harmless for any liability
174
- incurred by, or claims asserted against, such Contributor by reason
175
- of your accepting any such warranty or additional liability.
176
-
177
- END OF TERMS AND CONDITIONS
178
-
179
- APPENDIX: How to apply the Apache License to your work.
180
-
181
- To apply the Apache License to your work, attach the following
182
- boilerplate notice, with the fields enclosed by brackets "[]"
183
- replaced with your own identifying information. (Don't include
184
- the brackets!) The text should be enclosed in the appropriate
185
- comment syntax for the file format. We also recommend that a
186
- file or class name and description of purpose be included on the
187
- same "printed page" as the copyright notice for easier
188
- identification within third-party archives.
189
-
190
- Copyright [yyyy] [name of copyright owner]
191
-
192
- Licensed under the Apache License, Version 2.0 (the "License");
193
- you may not use this file except in compliance with the License.
194
- You may obtain a copy of the License at
195
-
196
- http://www.apache.org/licenses/LICENSE-2.0
197
-
198
- Unless required by applicable law or agreed to in writing, software
199
- distributed under the License is distributed on an "AS IS" BASIS,
200
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
- See the License for the specific language governing permissions and
202
- limitations under the License.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/text_detection_db/README.md DELETED
@@ -1,58 +0,0 @@
1
- # DB
2
-
3
- Real-time Scene Text Detection with Differentiable Binarization
4
-
5
- Note:
6
-
7
- - Models source: [here](https://drive.google.com/drive/folders/1qzNCHfUJOS0NEUOIKn69eCtxdlNPpWbq).
8
- - `IC15` in the filename means the model is trained on [IC15 dataset](https://rrc.cvc.uab.es/?ch=4&com=introduction), which can detect English text instances only.
9
- - `TD500` in the filename means the model is trained on [TD500 dataset](http://www.iapr-tc11.org/mediawiki/index.php/MSRA_Text_Detection_500_Database_(MSRA-TD500)), which can detect both English & Chinese instances.
10
- - Visit https://docs.opencv.org/master/d4/d43/tutorial_dnn_text_spotting.html for more information.
11
-
12
- ## Demo
13
-
14
- ### Python
15
-
16
- Run the following command to try the demo:
17
-
18
- ```shell
19
- # detect on camera input
20
- python demo.py
21
- # detect on an image
22
- python demo.py --input /path/to/image -v
23
-
24
- # get help regarding various parameters
25
- python demo.py --help
26
- ```
27
-
28
- ### C++
29
-
30
- Install latest OpenCV and CMake >= 3.24.0 to get started with:
31
-
32
- ```shell
33
- # A typical and default installation path of OpenCV is /usr/local
34
- cmake -B build -D OPENCV_INSTALLATION_PATH=/path/to/opencv/installation .
35
- cmake --build build
36
- # detect on camera input
37
- ./build/opencv_zoo_text_detection_db -m=/path/to/model
38
- # detect on an image
39
- ./build/opencv_zoo_text_detection_db -m=/path/to/model -i=/path/to/image -v
40
- # get help messages
41
- ./build/opencv_zoo_text_detection_db -h
42
- ```
43
-
44
- ### Example outputs
45
-
46
- ![mask](./example_outputs/mask.jpg)
47
-
48
- ![gsoc](./example_outputs/gsoc.jpg)
49
-
50
- ## License
51
-
52
- All files in this directory are licensed under [Apache 2.0 License](./LICENSE).
53
-
54
- ## Reference
55
-
56
- - https://arxiv.org/abs/1911.08947
57
- - https://github.com/MhLiao/DB
58
- - https://docs.opencv.org/master/d4/d43/tutorial_dnn_text_spotting.html
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/text_detection_db/db.py DELETED
@@ -1,55 +0,0 @@
1
- # This file is part of OpenCV Zoo project.
2
- # It is subject to the license terms in the LICENSE file found in the same directory.
3
- #
4
- # Copyright (C) 2021, Shenzhen Institute of Artificial Intelligence and Robotics for Society, all rights reserved.
5
- # Third party copyrights are property of their respective owners.
6
-
7
- import numpy as np
8
- import cv2 as cv
9
-
10
- class DB:
11
- def __init__(self, modelPath, inputSize=[736, 736], binaryThreshold=0.3, polygonThreshold=0.5, maxCandidates=200, unclipRatio=2.0, backendId=0, targetId=0):
12
- self._modelPath = modelPath
13
- self._model = cv.dnn_TextDetectionModel_DB(
14
- cv.dnn.readNet(self._modelPath)
15
- )
16
-
17
- self._inputSize = tuple(inputSize) # (w, h)
18
- self._inputHeight = inputSize[0]
19
- self._inputWidth = inputSize[1]
20
- self._binaryThreshold = binaryThreshold
21
- self._polygonThreshold = polygonThreshold
22
- self._maxCandidates = maxCandidates
23
- self._unclipRatio = unclipRatio
24
- self._backendId = backendId
25
- self._targetId = targetId
26
-
27
- self._model.setPreferableBackend(self._backendId)
28
- self._model.setPreferableTarget(self._targetId)
29
-
30
- self._model.setBinaryThreshold(self._binaryThreshold)
31
- self._model.setPolygonThreshold(self._polygonThreshold)
32
- self._model.setUnclipRatio(self._unclipRatio)
33
- self._model.setMaxCandidates(self._maxCandidates)
34
-
35
- self._model.setInputParams(1.0/255.0, self._inputSize, (122.67891434, 116.66876762, 104.00698793))
36
-
37
- @property
38
- def name(self):
39
- return self.__class__.__name__
40
-
41
- def setBackendAndTarget(self, backendId, targetId):
42
- self._backendId = backendId
43
- self._targetId = targetId
44
- self._model.setPreferableBackend(self._backendId)
45
- self._model.setPreferableTarget(self._targetId)
46
-
47
- def setInputSize(self, input_size):
48
- self._inputSize = tuple(input_size)
49
- self._model.setInputParams(1.0/255.0, self._inputSize, (122.67891434, 116.66876762, 104.00698793))
50
-
51
- def infer(self, image):
52
- assert image.shape[0] == self._inputSize[1], '{} (height of input image) != {} (preset height)'.format(image.shape[0], self._inputSize[1])
53
- assert image.shape[1] == self._inputSize[0], '{} (width of input image) != {} (preset width)'.format(image.shape[1], self._inputSize[0])
54
-
55
- return self._model.detect(image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/text_detection_db/demo.cpp DELETED
@@ -1,179 +0,0 @@
1
- #include <iostream>
2
-
3
- #include <opencv2/dnn.hpp>
4
- #include <opencv2/imgproc.hpp>
5
- #include <opencv2/highgui.hpp>
6
-
7
- using namespace std;
8
- using namespace cv;
9
- using namespace dnn;
10
-
11
- vector< pair<cv::dnn::Backend, cv::dnn::Target> > backendTargetPairs = {
12
- std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_OPENCV, dnn::DNN_TARGET_CPU),
13
- std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA),
14
- std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA_FP16),
15
- std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_TIMVX, dnn::DNN_TARGET_NPU),
16
- std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_CANN, dnn::DNN_TARGET_NPU)};
17
-
18
-
19
- std::string keys =
20
- "{ help h | | Print help message. }"
21
- "{ model m | text_detection_DB_IC15_resnet18_2021sep.onnx | Usage: Set model type, defaults to text_detection_DB_IC15_resnet18_2021sep.onnx }"
22
- "{ input i | | Usage: Path to input image or video file. Skip this argument to capture frames from a camera.}"
23
- "{ width | 736 | Usage: Resize input image to certain width, default = 736. It should be multiple by 32.}"
24
- "{ height | 736 | Usage: Resize input image to certain height, default = 736. It should be multiple by 32.}"
25
- "{ binary_threshold | 0.3 | Usage: Threshold of the binary map, default = 0.3.}"
26
- "{ polygon_threshold | 0.5 | Usage: Threshold of polygons, default = 0.5.}"
27
- "{ max_candidates | 200 | Usage: Set maximum number of polygon candidates, default = 200.}"
28
- "{ unclip_ratio | 2.0 | Usage: The unclip ratio of the detected text region, which determines the output size, default = 2.0.}"
29
- "{ save s | true | Usage: Specify to save file with results (i.e. bounding box, confidence level). Invalid in case of camera input.}"
30
- "{ viz v | true | Usage: Specify to open a new window to show results. Invalid in case of camera input.}"
31
- "{ backend bt | 0 | Choose one of computation backends: "
32
- "0: (default) OpenCV implementation + CPU, "
33
- "1: CUDA + GPU (CUDA), "
34
- "2: CUDA + GPU (CUDA FP16), "
35
- "3: TIM-VX + NPU, "
36
- "4: CANN + NPU}";
37
-
38
-
39
- class DB {
40
- public:
41
-
42
- DB(string modPath, Size inSize = Size(736, 736), float binThresh = 0.3,
43
- float polyThresh = 0.5, int maxCand = 200, double unRatio = 2.0,
44
- dnn::Backend bId = DNN_BACKEND_DEFAULT, dnn::Target tId = DNN_TARGET_CPU) : modelPath(modPath), inputSize(inSize), binaryThreshold(binThresh),
45
- polygonThreshold(polyThresh), maxCandidates(maxCand), unclipRatio(unRatio),
46
- backendId(bId), targetId(tId)
47
- {
48
- this->model = TextDetectionModel_DB(readNet(modelPath));
49
- this->model.setPreferableBackend(backendId);
50
- this->model.setPreferableTarget(targetId);
51
-
52
- this->model.setBinaryThreshold(binaryThreshold);
53
- this->model.setPolygonThreshold(polygonThreshold);
54
- this->model.setUnclipRatio(unclipRatio);
55
- this->model.setMaxCandidates(maxCandidates);
56
-
57
- this->model.setInputParams(1.0 / 255.0, inputSize, Scalar(122.67891434, 116.66876762, 104.00698793));
58
- }
59
- pair< vector<vector<Point>>, vector<float> > infer(Mat image) {
60
- CV_Assert(image.rows == this->inputSize.height && "height of input image != net input size ");
61
- CV_Assert(image.cols == this->inputSize.width && "width of input image != net input size ");
62
- vector<vector<Point>> pt;
63
- vector<float> confidence;
64
- this->model.detect(image, pt, confidence);
65
- return make_pair< vector<vector<Point>> &, vector< float > &>(pt, confidence);
66
- }
67
-
68
- private:
69
- string modelPath;
70
- TextDetectionModel_DB model;
71
- Size inputSize;
72
- float binaryThreshold;
73
- float polygonThreshold;
74
- int maxCandidates;
75
- double unclipRatio;
76
- dnn::Backend backendId;
77
- dnn::Target targetId;
78
-
79
- };
80
-
81
- Mat visualize(Mat image, pair< vector<vector<Point>>, vector<float> >&results, double fps=-1, Scalar boxColor=Scalar(0, 255, 0), Scalar textColor=Scalar(0, 0, 255), bool isClosed=true, int thickness=2)
82
- {
83
- Mat output;
84
- image.copyTo(output);
85
- if (fps > 0)
86
- putText(output, format("FPS: %.2f", fps), Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, textColor);
87
- polylines(output, results.first, isClosed, boxColor, thickness);
88
- return output;
89
- }
90
-
91
- int main(int argc, char** argv)
92
- {
93
- CommandLineParser parser(argc, argv, keys);
94
-
95
- parser.about("Use this program to run Real-time Scene Text Detection with Differentiable Binarization in opencv Zoo using OpenCV.");
96
- if (parser.has("help"))
97
- {
98
- parser.printMessage();
99
- return 0;
100
- }
101
-
102
- int backendTargetid = parser.get<int>("backend");
103
- String modelName = parser.get<String>("model");
104
-
105
- if (modelName.empty())
106
- {
107
- CV_Error(Error::StsError, "Model file " + modelName + " not found");
108
- }
109
-
110
- Size inpSize(parser.get<int>("width"), parser.get<int>("height"));
111
- float binThresh = parser.get<float>("binary_threshold");
112
- float polyThresh = parser.get<float>("polygon_threshold");
113
- int maxCand = parser.get<int>("max_candidates");
114
- double unRatio = parser.get<float>("unclip_ratio");
115
- bool save = parser.get<bool>("save");
116
- bool viz = parser.get<float>("viz");
117
-
118
- DB model(modelName, inpSize, binThresh, polyThresh, maxCand, unRatio, backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);
119
-
120
- //! [Open a video file or an image file or a camera stream]
121
- VideoCapture cap;
122
- if (parser.has("input"))
123
- cap.open(parser.get<String>("input"));
124
- else
125
- cap.open(0);
126
- if (!cap.isOpened())
127
- CV_Error(Error::StsError, "Cannot opend video or file");
128
- Mat originalImage;
129
- static const std::string kWinName = modelName;
130
- while (waitKey(1) < 0)
131
- {
132
- cap >> originalImage;
133
- if (originalImage.empty())
134
- {
135
- cout << "Frame is empty" << endl;
136
- waitKey();
137
- break;
138
- }
139
- int originalW = originalImage.cols;
140
- int originalH = originalImage.rows;
141
- double scaleHeight = originalH / double(inpSize.height);
142
- double scaleWidth = originalW / double(inpSize.width);
143
- Mat image;
144
- resize(originalImage, image, inpSize);
145
-
146
- // inference
147
- TickMeter tm;
148
- tm.start();
149
- pair< vector<vector<Point>>, vector<float> > results = model.infer(image);
150
- tm.stop();
151
- auto x = results.first;
152
- // Scale the results bounding box
153
- for (auto &pts : results.first)
154
- {
155
- for (int i = 0; i < 4; i++)
156
- {
157
- pts[i].x = int(pts[i].x * scaleWidth);
158
- pts[i].y = int(pts[i].y * scaleHeight);
159
- }
160
- }
161
- originalImage = visualize(originalImage, results, tm.getFPS());
162
- tm.reset();
163
- if (parser.has("input"))
164
- {
165
- if (save)
166
- {
167
- cout << "Result image saved to result.jpg\n";
168
- imwrite("result.jpg", originalImage);
169
- }
170
- if (viz)
171
- imshow(kWinName, originalImage);
172
- }
173
- else
174
- imshow(kWinName, originalImage);
175
- }
176
- return 0;
177
- }
178
-
179
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/text_detection_db/demo.py DELETED
@@ -1,154 +0,0 @@
1
- # This file is part of OpenCV Zoo project.
2
- # It is subject to the license terms in the LICENSE file found in the same directory.
3
- #
4
- # Copyright (C) 2021, Shenzhen Institute of Artificial Intelligence and Robotics for Society, all rights reserved.
5
- # Third party copyrights are property of their respective owners.
6
-
7
- import argparse
8
-
9
- import numpy as np
10
- import cv2 as cv
11
-
12
- from db import DB
13
-
14
- # Check OpenCV version
15
- assert cv.__version__ >= "4.8.0", \
16
- "Please install latest opencv-python to try this demo: python3 -m pip install --upgrade opencv-python"
17
-
18
- # Valid combinations of backends and targets
19
- backend_target_pairs = [
20
- [cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_CPU],
21
- [cv.dnn.DNN_BACKEND_CUDA, cv.dnn.DNN_TARGET_CUDA],
22
- [cv.dnn.DNN_BACKEND_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16],
23
- [cv.dnn.DNN_BACKEND_TIMVX, cv.dnn.DNN_TARGET_NPU],
24
- [cv.dnn.DNN_BACKEND_CANN, cv.dnn.DNN_TARGET_NPU]
25
- ]
26
-
27
- parser = argparse.ArgumentParser(description='Real-time Scene Text Detection with Differentiable Binarization (https://arxiv.org/abs/1911.08947).')
28
- parser.add_argument('--input', '-i', type=str,
29
- help='Usage: Set path to the input image. Omit for using default camera.')
30
- parser.add_argument('--model', '-m', type=str, default='text_detection_DB_TD500_resnet18_2021sep.onnx',
31
- help='Usage: Set model path, defaults to text_detection_DB_TD500_resnet18_2021sep.onnx.')
32
- parser.add_argument('--backend_target', '-bt', type=int, default=0,
33
- help='''Choose one of the backend-target pair to run this demo:
34
- {:d}: (default) OpenCV implementation + CPU,
35
- {:d}: CUDA + GPU (CUDA),
36
- {:d}: CUDA + GPU (CUDA FP16),
37
- {:d}: TIM-VX + NPU,
38
- {:d}: CANN + NPU
39
- '''.format(*[x for x in range(len(backend_target_pairs))]))
40
- parser.add_argument('--width', type=int, default=736,
41
- help='Usage: Resize input image to certain width, default = 736. It should be multiple by 32.')
42
- parser.add_argument('--height', type=int, default=736,
43
- help='Usage: Resize input image to certain height, default = 736. It should be multiple by 32.')
44
- parser.add_argument('--binary_threshold', type=float, default=0.3,
45
- help='Usage: Threshold of the binary map, default = 0.3.')
46
- parser.add_argument('--polygon_threshold', type=float, default=0.5,
47
- help='Usage: Threshold of polygons, default = 0.5.')
48
- parser.add_argument('--max_candidates', type=int, default=200,
49
- help='Usage: Set maximum number of polygon candidates, default = 200.')
50
- parser.add_argument('--unclip_ratio', type=np.float64, default=2.0,
51
- help=' Usage: The unclip ratio of the detected text region, which determines the output size, default = 2.0.')
52
- parser.add_argument('--save', '-s', action='store_true',
53
- help='Usage: Specify to save file with results (i.e. bounding box, confidence level). Invalid in case of camera input.')
54
- parser.add_argument('--vis', '-v', action='store_true',
55
- help='Usage: Specify to open a new window to show results. Invalid in case of camera input.')
56
- args = parser.parse_args()
57
-
58
- def visualize(image, results, box_color=(0, 255, 0), text_color=(0, 0, 255), isClosed=True, thickness=2, fps=None):
59
- output = image.copy()
60
-
61
- if fps is not None:
62
- cv.putText(output, 'FPS: {:.2f}'.format(fps), (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, text_color)
63
-
64
- pts = np.array(results[0])
65
- output = cv.polylines(output, pts, isClosed, box_color, thickness)
66
-
67
- return output
68
-
69
- if __name__ == '__main__':
70
- backend_id = backend_target_pairs[args.backend_target][0]
71
- target_id = backend_target_pairs[args.backend_target][1]
72
-
73
- # Instantiate DB
74
- model = DB(modelPath=args.model,
75
- inputSize=[args.width, args.height],
76
- binaryThreshold=args.binary_threshold,
77
- polygonThreshold=args.polygon_threshold,
78
- maxCandidates=args.max_candidates,
79
- unclipRatio=args.unclip_ratio,
80
- backendId=backend_id,
81
- targetId=target_id)
82
-
83
- # If input is an image
84
- if args.input is not None:
85
- original_image = cv.imread(args.input)
86
- original_w = original_image.shape[1]
87
- original_h = original_image.shape[0]
88
- scaleHeight = original_h / args.height
89
- scaleWidth = original_w / args.width
90
- image = cv.resize(original_image, [args.width, args.height])
91
-
92
- # Inference
93
- results = model.infer(image)
94
-
95
- # Scale the results bounding box
96
- for i in range(len(results[0])):
97
- for j in range(4):
98
- box = results[0][i][j]
99
- results[0][i][j][0] = box[0] * scaleWidth
100
- results[0][i][j][1] = box[1] * scaleHeight
101
-
102
- # Print results
103
- print('{} texts detected.'.format(len(results[0])))
104
- for idx, (bbox, score) in enumerate(zip(results[0], results[1])):
105
- print('{}: {} {} {} {}, {:.2f}'.format(idx, bbox[0], bbox[1], bbox[2], bbox[3], score))
106
-
107
- # Draw results on the input image
108
- original_image = visualize(original_image, results)
109
-
110
- # Save results if save is true
111
- if args.save:
112
- print('Resutls saved to result.jpg\n')
113
- cv.imwrite('result.jpg', original_image)
114
-
115
- # Visualize results in a new window
116
- if args.vis:
117
- cv.namedWindow(args.input, cv.WINDOW_AUTOSIZE)
118
- cv.imshow(args.input, original_image)
119
- cv.waitKey(0)
120
- else: # Omit input to call default camera
121
- deviceId = 0
122
- cap = cv.VideoCapture(deviceId)
123
-
124
- tm = cv.TickMeter()
125
- while cv.waitKey(1) < 0:
126
- hasFrame, original_image = cap.read()
127
- if not hasFrame:
128
- print('No frames grabbed!')
129
- break
130
-
131
- original_w = original_image.shape[1]
132
- original_h = original_image.shape[0]
133
- scaleHeight = original_h / args.height
134
- scaleWidth = original_w / args.width
135
- frame = cv.resize(original_image, [args.width, args.height])
136
- # Inference
137
- tm.start()
138
- results = model.infer(frame) # results is a tuple
139
- tm.stop()
140
-
141
- # Scale the results bounding box
142
- for i in range(len(results[0])):
143
- for j in range(4):
144
- box = results[0][i][j]
145
- results[0][i][j][0] = box[0] * scaleWidth
146
- results[0][i][j][1] = box[1] * scaleHeight
147
-
148
- # Draw results on the input image
149
- original_image = visualize(original_image, results, fps=tm.getFPS())
150
-
151
- # Visualize results in a new Window
152
- cv.imshow('{} Demo'.format(model.name), original_image)
153
-
154
- tm.reset()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/text_recognition_crnn/demo.cpp CHANGED
@@ -41,10 +41,10 @@ std::string keys =
41
  "4: CANN + NPU}";
42
 
43
 
44
- class DB {
45
  public:
46
 
47
- DB(string modPath, Size inSize = Size(736, 736), float binThresh = 0.3,
48
  float polyThresh = 0.5, int maxCand = 200, double unRatio = 2.0,
49
  dnn::Backend bId = DNN_BACKEND_DEFAULT, dnn::Target tId = DNN_TARGET_CPU) : modelPath(modPath), inputSize(inSize), binaryThreshold(binThresh),
50
  polygonThreshold(polyThresh), maxCandidates(maxCand), unclipRatio(unRatio),
@@ -215,7 +215,7 @@ int main(int argc, char** argv)
215
  bool save = parser.get<bool>("save");
216
  bool viz = parser.get<float>("viz");
217
 
218
- DB detector("../text_detection_db/text_detection_DB_IC15_resnet18_2021sep.onnx", inpSize, binThresh, polyThresh, maxCand, unRatio, backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);
219
  CRNN recognizer(modelPath, backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);
220
  //! [Open a video file or an image file or a camera stream]
221
  VideoCapture cap;
@@ -232,9 +232,13 @@ int main(int argc, char** argv)
232
  cap >> originalImage;
233
  if (originalImage.empty())
234
  {
235
- cout << "Frame is empty" << endl;
236
- waitKey();
237
- break;
 
 
 
 
238
  }
239
  int originalW = originalImage.cols;
240
  int originalH = originalImage.rows;
 
41
  "4: CANN + NPU}";
42
 
43
 
44
+ class PPOCRDet {
45
  public:
46
 
47
+ PPOCRDet(string modPath, Size inSize = Size(736, 736), float binThresh = 0.3,
48
  float polyThresh = 0.5, int maxCand = 200, double unRatio = 2.0,
49
  dnn::Backend bId = DNN_BACKEND_DEFAULT, dnn::Target tId = DNN_TARGET_CPU) : modelPath(modPath), inputSize(inSize), binaryThreshold(binThresh),
50
  polygonThreshold(polyThresh), maxCandidates(maxCand), unclipRatio(unRatio),
 
215
  bool save = parser.get<bool>("save");
216
  bool viz = parser.get<float>("viz");
217
 
218
+ PPOCRDet detector("../text_detection_ppocr/text_detection_en_ppocrv3_2023may.onnx", inpSize, binThresh, polyThresh, maxCand, unRatio, backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);
219
  CRNN recognizer(modelPath, backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);
220
  //! [Open a video file or an image file or a camera stream]
221
  VideoCapture cap;
 
232
  cap >> originalImage;
233
  if (originalImage.empty())
234
  {
235
+ if (parser.has("input"))
236
+ {
237
+ cout << "Frame is empty" << endl;
238
+ break;
239
+ }
240
+ else
241
+ continue;
242
  }
243
  int originalW = originalImage.cols;
244
  int originalH = originalImage.rows;
models/text_recognition_crnn/demo.py CHANGED
@@ -12,8 +12,8 @@ import cv2 as cv
12
 
13
  from crnn import CRNN
14
 
15
- sys.path.append('../text_detection_db')
16
- from db import DB
17
 
18
  # Check OpenCV version
19
  assert cv.__version__ >= "4.8.0", \
@@ -65,8 +65,8 @@ if __name__ == '__main__':
65
  backend_id = backend_target_pairs[args.backend_target][0]
66
  target_id = backend_target_pairs[args.backend_target][1]
67
 
68
- # Instantiate DB for text detection
69
- detector = DB(modelPath='../text_detection_db/text_detection_DB_IC15_resnet18_2021sep.onnx',
70
  inputSize=[args.width, args.height],
71
  binaryThreshold=0.3,
72
  polygonThreshold=0.5,
 
12
 
13
  from crnn import CRNN
14
 
15
+ sys.path.append('../text_detection_ppocr')
16
+ from ppocr_det import PPOCRDet
17
 
18
  # Check OpenCV version
19
  assert cv.__version__ >= "4.8.0", \
 
65
  backend_id = backend_target_pairs[args.backend_target][0]
66
  target_id = backend_target_pairs[args.backend_target][1]
67
 
68
+ # Instantiate PPOCRDet for text detection
69
+ detector = PPOCRDet(modelPath='../text_detection_ppocr/text_detection_en_ppocrv3_2023may.onnx',
70
  inputSize=[args.width, args.height],
71
  binaryThreshold=0.3,
72
  polygonThreshold=0.5,